diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c28a982 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# git rm -r --cached . +# git add . +# git commit -m 'update .gitignore' + +*.iml +.gradle +/local.properties +/.idea +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties +/cmake-build-debug diff --git a/3rdparty/TNN/.clang-format b/3rdparty/TNN/.clang-format new file mode 100755 index 0000000..7850633 --- /dev/null +++ b/3rdparty/TNN/.clang-format @@ -0,0 +1,114 @@ +--- +# 语言: None Cpp Java ObjC Protp +Language: Cpp +#LLVM Google +BasedOnStyle: Google +# 语言: None Cpp Java ObjC Protp +# 访问说明符的偏移(public private) +AccessModifierOffset: -4 +# 括号之后,水平对齐参数: Align DontAlign AlwaysBreak +AlignAfterOpenBracket: Align +# 连续的宏 +# AlignConsecutiveMacros: true +# 连续的赋值时,对齐所有的等号 +AlignConsecutiveAssignments: true +# 左对齐换行(使用反斜杠换行)的反斜杠 +AlignEscapedNewlines: Right +# # 左对齐换行(使用反斜杠换行)的反斜杠 +# AlignEscapedNewlinesLeft: true +# 水平对齐二元和三元表达式的操作数 +AlignOperands: true +# 允许函数声明的所有参数在放在下一行 +AllowAllParametersOfDeclarationOnNextLine: false +# AllowAllArgumentsOnNextLine: false +# 允许短的块放在同一行 +AllowShortBlocksOnASingleLine : false +# 允许短的case标签放在同一行 +AllowShortCaseLabelsOnASingleLine: false +# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All +AllowShortFunctionsOnASingleLine: Empty +# 是否允许短if单行 If true, if (a) return; 可以放到同一行 +AllowShortIfStatementsOnASingleLine: false +# 允许短的循环保持在同一行 +AllowShortLoopsOnASingleLine: false +# 总是在定义返回类型后换行(deprecated) +AlwaysBreakAfterDefinitionReturnType: None +# 每行字符的限制,0表示没有限制 +ColumnLimit: 120 +# 描述具有特殊意义的注释的正则表达式,它不应该被分割为多行或以其它方式改变 +CommentPragmas: '^ IWYU pragma:' +#指针的*的挨着哪边 +PointerAlignment: Right +#缩进宽度 +IndentWidth: 4 +# OC block后面的缩进 +ObjCBlockIndentWidth: 4 +#tab键盘的宽度 +TabWidth: 4 +Standard: Cpp11 +UseTab: Never +CompactNamespaces: false +# 
命名空间的偏移 +NamespaceIndentation: Inner +# 命名空间的末尾注释 +FixNamespaceComments: true +# IndentPPDirectives: BeforeHash +--- +# 语言: None Cpp Java ObjC Protp +Language: ObjC +#LLVM Google +BasedOnStyle: LLVM +# 访问说明符的偏移(public private) +AccessModifierOffset: -4 +# 括号之后,水平对齐参数: Align DontAlign AlwaysBreak +AlignAfterOpenBracket: Align +# 连续的宏 +# AlignConsecutiveMacros: true +# 连续的赋值时,对齐所有的等号 +AlignConsecutiveAssignments: true +# 左对齐换行(使用反斜杠换行)的反斜杠 +AlignEscapedNewlines: Right +# # 左对齐换行(使用反斜杠换行)的反斜杠 +# AlignEscapedNewlinesLeft: true +# 水平对齐二元和三元表达式的操作数 +AlignOperands: true +# 允许函数声明的所有参数在放在下一行 +AllowAllParametersOfDeclarationOnNextLine: false +# AllowAllArgumentsOnNextLine: false +# 允许短的块放在同一行 +AllowShortBlocksOnASingleLine : false +# 允许短的case标签放在同一行 +AllowShortCaseLabelsOnASingleLine: false +# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All +AllowShortFunctionsOnASingleLine: Empty +# 是否允许短if单行 If true, if (a) return; 可以放到同一行 +AllowShortIfStatementsOnASingleLine: false +# 允许短的循环保持在同一行 +AllowShortLoopsOnASingleLine: false +# 总是在定义返回类型后换行(deprecated) +AlwaysBreakAfterDefinitionReturnType: None +# 每行字符的限制,0表示没有限制 +ColumnLimit: 120 +# 描述具有特殊意义的注释的正则表达式,它不应该被分割为多行或以其它方式改变 +CommentPragmas: '^ IWYU pragma:' +#指针的*的挨着哪边 +PointerAlignment: Right +#缩进宽度 +IndentWidth: 4 +# OC block后面的缩进 +ObjCBlockIndentWidth: 4 +#tab键盘的宽度 +TabWidth: 4 +Standard: Cpp11 +UseTab: Never +CompactNamespaces: false +# 命名空间的偏移 +NamespaceIndentation: Inner +# 命名空间的末尾注释 +FixNamespaceComments: true +# IndentPPDirectives: BeforeHash +--- +Language: Proto +#.proto文件不格式化 +DisableFormat: true +... 
diff --git a/3rdparty/TNN/.github/ISSUE_TEMPLATE/feature_request.md b/3rdparty/TNN/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/3rdparty/TNN/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/3rdparty/TNN/.github/ISSUE_TEMPLATE/model-converter-issue.md b/3rdparty/TNN/.github/ISSUE_TEMPLATE/model-converter-issue.md new file mode 100644 index 0000000..6ad2d0c --- /dev/null +++ b/3rdparty/TNN/.github/ISSUE_TEMPLATE/model-converter-issue.md @@ -0,0 +1,31 @@ +--- +name: model converter issue +about: Describe this issue template's purpose here. +title: '' +labels: '' +assignees: '' + +--- + +**1. 使用环境(environment)** + - OS: Mac/Ubuntu/Centos/Docker + - OS Version: + + **2. Github版本** + - branch: + - commit(optional): + + **3. 详细描述bug 情况 (Describe the bug)** + A clear and concise description of what the bug is. + - issue type: 编译问题(Build failed)/ 模型转换失败(converter failed)/ 模型不对齐(model misalignment) + - original model: Caffe/ONNX/TensorFlow/TensorFlowLite + (如果可以的话,请上传原始的模型文件) + + **4. 日志(Log)** + ```txt + 将日志粘贴在这里 + Paste log here or pastebin + ``` + + **5. 截图(Screenshots)** + If applicable, add screenshots to help explain your problem. 
diff --git a/3rdparty/TNN/.github/ISSUE_TEMPLATE/tnn-inference-issue.md b/3rdparty/TNN/.github/ISSUE_TEMPLATE/tnn-inference-issue.md new file mode 100644 index 0000000..8df4d15 --- /dev/null +++ b/3rdparty/TNN/.github/ISSUE_TEMPLATE/tnn-inference-issue.md @@ -0,0 +1,41 @@ +--- +name: tnn inference issue +about: Describe this issue template's purpose here. +title: '' +labels: '' +assignees: '' + +--- + + **1. 环境(environment)** + - Build OS and Version: Mac/Ubuntu/Centos/Windows + - RunTime OS Version: Linux/Android/IOS + - RunTime DEVICE: ARM/OPENCL/METAL + + **2. Github版本** + - branch: + - commit(optional): + + **3. 编译方式(compile method)** + CMake完整编译参数(full cmake arguments) + + **4. 编译日志(build log)** + ```txt + 将日志粘贴在这里 + Paste log here or pastebin + ``` + + **5. 详细描述bug 情况 (Describe the bug)** + + + **6. 运行日志(runtime log)** + ```txt + 将日志粘贴在这里 + Paste log here or pastebin + ``` + + **7. 截图(Screenshots)** + ```txt + 将截图粘贴在这里 + Paste screenshorts here or pastebin + ``` diff --git a/3rdparty/TNN/.github/release-drafter.yml b/3rdparty/TNN/.github/release-drafter.yml new file mode 100644 index 0000000..b05f832 --- /dev/null +++ b/3rdparty/TNN/.github/release-drafter.yml @@ -0,0 +1,29 @@ +name-template: 'TNN v$RESOLVED_VERSION' +tag-template: 'v$RESOLVED_VERSION' +categories: + - title: '🚀 Features' + labels: + - 'enhancement' + - title: '🐛 Bug Fixes' + labels: + - 'bug' + - title: '🧰 Maintenance' + labels: + - 'documentation' +change-template: '- $TITLE @$AUTHOR (#$NUMBER)' +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. 
+version-resolver: + major: + labels: + - 'major' + minor: + labels: + - 'minor' + patch: + labels: + - 'patch' + default: patch +template: | + ## Changes + + $CHANGES \ No newline at end of file diff --git a/3rdparty/TNN/.github/workflows/android-arm-cpu.yml b/3rdparty/TNN/.github/workflows/android-arm-cpu.yml new file mode 100644 index 0000000..f49b5c0 --- /dev/null +++ b/3rdparty/TNN/.github/workflows/android-arm-cpu.yml @@ -0,0 +1,32 @@ +name: android-arm-cpu +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + CONDITION: ${{ steps.preflight.outputs.CONDITION }} + steps: + - uses: actions/checkout@v2 + - name: Preflight + id: preflight + run: | + echo ::set-output name=CONDITION::0 + ./scripts/.ci/preflight.sh android || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + + android: + needs: [setup] + if: ${{ needs.setup.outputs.CONDITION != '11' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: configure + run: sudo apt-get install attr + - name: build + run: export ANDROID_NDK=$ANDROID_HOME/ndk-bundle && ./scripts/build_android.sh \ No newline at end of file diff --git a/3rdparty/TNN/.github/workflows/ios-cpu.yml b/3rdparty/TNN/.github/workflows/ios-cpu.yml new file mode 100644 index 0000000..ff47973 --- /dev/null +++ b/3rdparty/TNN/.github/workflows/ios-cpu.yml @@ -0,0 +1,30 @@ +name: ios-cpu +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + CONDITION: ${{ steps.preflight.outputs.CONDITION }} + steps: + - uses: actions/checkout@v2 + - name: Preflight + id: preflight + run: | + echo ::set-output name=CONDITION::0 + ./scripts/.ci/preflight.sh ios || ret=$? 
&& echo $ret && echo ::set-output name=CONDITION::$ret + + ios-iphone-os: + needs: [setup] + if: ${{ needs.setup.outputs.CONDITION != '11' }} + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + - name: build + run: ./scripts/build_ios.sh diff --git a/3rdparty/TNN/.github/workflows/linux-x86-cpu-gcc.yml b/3rdparty/TNN/.github/workflows/linux-x86-cpu-gcc.yml new file mode 100644 index 0000000..3195d75 --- /dev/null +++ b/3rdparty/TNN/.github/workflows/linux-x86-cpu-gcc.yml @@ -0,0 +1,34 @@ +name: linux-x86-cpu-gcc +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + CONDITION: ${{ steps.preflight.outputs.CONDITION }} + steps: + - uses: actions/checkout@v2 + - name: Preflight + id: preflight + run: | + echo ::set-output name=CONDITION::0 + ./scripts/.ci/preflight.sh x86 || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + + linux-gcc: + needs: [setup] + if: ${{ needs.setup.outputs.CONDITION != '11' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: update + run: sudo apt-get update + - name: gcc-multilib + run: sudo apt-get install gcc-multilib g++-multilib libprotobuf-dev protobuf-compiler + - name: build + run: ./scripts/build_x86_linux.sh diff --git a/3rdparty/TNN/.github/workflows/macos-x64-cpu.yml b/3rdparty/TNN/.github/workflows/macos-x64-cpu.yml new file mode 100644 index 0000000..e93862e --- /dev/null +++ b/3rdparty/TNN/.github/workflows/macos-x64-cpu.yml @@ -0,0 +1,32 @@ +name: macos-x64-cpu +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + CONDITION: ${{ steps.preflight.outputs.CONDITION }} + steps: + - uses: actions/checkout@v2 + - name: Preflight + id: preflight + run: | + echo ::set-output name=CONDITION::0 + ./scripts/.ci/preflight.sh x86 || ret=$? 
&& echo $ret && echo ::set-output name=CONDITION::$ret + + macos-clang: + needs: [setup] + if: ${{ needs.setup.outputs.CONDITION != '11' }} + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + - name: protobuf + run: brew install protobuf opencv3 + - name: build + run: ./scripts/build_macos.sh diff --git a/3rdparty/TNN/.github/workflows/release-drafter.yml b/3rdparty/TNN/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..c77b1be --- /dev/null +++ b/3rdparty/TNN/.github/workflows/release-drafter.yml @@ -0,0 +1,16 @@ +name: Release Drafter + +on: + push: + branches: + - master + +jobs: + update_release_draft: + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + config-name: release-drafter.yml diff --git a/3rdparty/TNN/.github/workflows/release.yml b/3rdparty/TNN/.github/workflows/release.yml new file mode 100644 index 0000000..1889c2f --- /dev/null +++ b/3rdparty/TNN/.github/workflows/release.yml @@ -0,0 +1,617 @@ +name: Release + +on: + push: + tags: + - '*' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + VERSION: ${{ steps.get_version.outputs.VERSION }} + steps: + - name: Get-version + id: get_version + run: | + echo "github ref:" ${GITHUB_REF} + echo "tag version:" ${GITHUB_REF/refs\/tags\//} + echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//} + + full-source: + needs: [setup] + runs-on: ubuntu-latest + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-full-source + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Compress + env: + ASSET_PATH: /tmp/${{ env.ASSET_NAME }}.zip + run: | + echo "compress to" ${ASSET_PATH} + rm -rf .git + rm -f ${ASSET_PATH} + zip -9r ${ASSET_PATH} . 
+ - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: /tmp/${{ env.ASSET_NAME }}.zip + if-no-files-found: error + + android: + needs: [setup] + runs-on: ubuntu-latest + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-android + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Configure + run: sudo apt-get install attr + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.zip + run: | + cd ./scripts + export ANDROID_NDK=$ANDROID_HOME/ndk-bundle && ./build_android.sh + cd ./release + zip -9r ${ASSET_PATH} . + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/release/${{ env.ASSET_NAME }}.zip + if-no-files-found: error + + ios: + needs: [setup] + runs-on: macos-latest + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ios + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + run: | + ./scripts/build_ios.sh + cd ./platforms/ios + zip -9r ${{ env.ASSET_NAME }}.zip ./tnn.bundle ./tnn.framework + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./platforms/ios/${{ env.ASSET_NAME }}.zip + if-no-files-found: error + + centos7-x86: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: neiltian/tnn-cuda-build-env:centos7-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-centos7-x86 + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + export PATH=$PATH:/usr/local/cmake-3.18.4-Linux-x86_64/bin/ + cd ./scripts + ./build_x86_linux.sh + cd ./x86_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/x86_linux_release/${{ env.ASSET_NAME 
}}.tar.gz + if-no-files-found: error + + centos7-cuda: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: neiltian/tnn-cuda-build-env:centos7-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-centos7-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + export PATH=$PATH:/usr/local/cmake-3.18.4-Linux-x86_64/bin/ + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_cuda_linux.sh + cd cuda_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/cuda_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + centos7-x86-cuda: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: neiltian/tnn-cuda-build-env:centos7-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-centos7-x86-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + export PATH=$PATH:/usr/local/cmake-3.18.4-Linux-x86_64/bin/ + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_linux.sh + cd linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + centos8-x86: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: neiltian/tnn-cuda-build-env:centos8-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-centos8-x86 + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + 
ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + export PATH=$PATH:/usr/local/cmake-3.18.4-Linux-x86_64/bin/ + cd ./scripts + ./build_x86_linux.sh + cd ./x86_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/x86_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1604-x86: + needs: [setup] + runs-on: ubuntu-16.04 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-16.04-x86 + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Configure + run: sudo apt-get install attr + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + cd ./scripts + ./build_x86_linux.sh + cd ./x86_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/x86_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1604-cuda: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: neiltian/tnn-cuda-build-env:ubuntu-16.04-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-16.04-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + export PATH=$PATH:/usr/local/cmake-3.15.3/bin/ + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_cuda_linux.sh + cd cuda_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/cuda_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1604-x86-cuda: + needs: [setup] + runs-on: ubuntu-16.04 + container: + image: 
neiltian/tnn-cuda-build-env:ubuntu-16.04-cuda10.2-cudnn8-trt7.1 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-16.04-x86-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + apt-get install wget + export PATH=$PATH:/usr/local/cmake-3.15.3/bin/ + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_linux.sh + cd linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1804-x86: + needs: [setup] + runs-on: ubuntu-18.04 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-18.04-x86 + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Configure + run: sudo apt-get install attr + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + cd ./scripts + ./build_x86_linux.sh + cd ./x86_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/x86_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1804-cuda: + needs: [setup] + runs-on: ubuntu-18.04 + container: + image: nvcr.io/nvidia/tensorrt:20.03-py3 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-18.04-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + mkdir /usr/local/cudnn + mkdir /usr/local/tensorrt + ln -s /usr/include/ /usr/local/cudnn/include + ln -s /usr/include/ /usr/local/tensorrt/include + ln -s /usr/lib/x86_64-linux-gnu/ /usr/local/cudnn/lib64 + ln -s /usr/lib/x86_64-linux-gnu/ 
/usr/local/tensorrt/lib + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_cuda_linux.sh + cd cuda_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/cuda_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-1804-x86-cuda: + needs: [setup] + runs-on: ubuntu-18.04 + container: + image: nvcr.io/nvidia/tensorrt:20.03-py3 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-18.04-x86-cuda + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + mkdir /usr/local/cudnn + mkdir /usr/local/tensorrt + ln -s /usr/include/ /usr/local/cudnn/include + ln -s /usr/include/ /usr/local/tensorrt/include + ln -s /usr/lib/x86_64-linux-gnu/ /usr/local/cudnn/lib64 + ln -s /usr/lib/x86_64-linux-gnu/ /usr/local/tensorrt/lib + export TENSORRT_ROOT_DIR=/usr/local/tensorrt/ + export CUDNN_ROOT_DIR=/usr/local/cudnn/ + cd ./scripts + ./build_linux.sh + cd linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + ubuntu-2004-x86: + needs: [setup] + runs-on: ubuntu-20.04 + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-ubuntu-20.04-x86 + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Configure + run: sudo apt-get install attr + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.tar.gz + run: | + cd ./scripts + ./build_x86_linux.sh + cd ./x86_linux_release + tar -zcvf ${ASSET_PATH} lib include bin + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: 
./scripts/x86_linux_release/${{ env.ASSET_NAME }}.tar.gz + if-no-files-found: error + + macos: + needs: [setup] + runs-on: macos-latest + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-macos + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - name: Build + env: + ASSET_PATH: ${{ env.ASSET_NAME }}.zip + run: | + cd ./scripts + ./build_macos.sh + cd ./macos_release + zip -9r ${ASSET_PATH} . + - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: ./scripts/macos_release/${{ env.ASSET_NAME }}.zip + if-no-files-found: error + + windows: + needs: [setup] + runs-on: windows-latest + env: + ASSET_NAME: tnn-${{ needs.setup.outputs.VERSION }}-windows + outputs: + ASSET_NAME: ${{ env.ASSET_NAME }} + steps: + - uses: actions/checkout@v2 + - uses: ilammy/msvc-dev-cmd@v1 + - uses: seanmiddleditch/gha-setup-ninja@master + - name: Build + run: | + cd .\scripts + .\build_msvc.bat + cd .\msvc_release + 7z a -r ${{ env.ASSET_NAME }}.zip . 
+ - name: Upload_asset + uses: actions/upload-artifact@v2 + with: + name: ${{ env.ASSET_NAME }} + path: .\scripts\msvc_release\${{ env.ASSET_NAME }}.zip + if-no-files-found: error + + release: + needs: [setup, full-source, android, ios, centos7-x86, centos7-cuda, centos7-x86-cuda, centos8-x86, ubuntu-1604-x86, ubuntu-1604-cuda, ubuntu-1604-x86-cuda, ubuntu-1804-x86, ubuntu-1804-cuda, ubuntu-1804-x86-cuda, ubuntu-2004-x86, macos, windows] + runs-on: ubuntu-latest + env: + ARTIFACTS_PATH: artifacts + steps: + - name: Download + id: download_artifacts + uses: actions/download-artifact@v2 + with: + path: ${{ env.ARTIFACTS_PATH }} + - name: Create-release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ needs.setup.outputs.VERSION }} + release_name: TNN ${{ needs.setup.outputs.VERSION }} + draft: true + prerelease: false + - name: Upload-full-source + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.full-source.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.zip + asset_name: ${{ env.ASSET_NAME }}.zip + asset_content_type: application/zip + - name: Upload-android + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.android.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.zip + asset_name: ${{ env.ASSET_NAME }}.zip + asset_content_type: application/zip + - name: Upload-ios + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ios.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH 
}}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.zip + asset_name: ${{ env.ASSET_NAME }}.zip + asset_content_type: application/zip + - name: Upload-centos7-x86 + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.centos7-x86.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-centos8-x86 + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.centos8-x86.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-16.04-x86 + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-1604-x86.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-16.04-cuda + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-1604-cuda.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-16.04-x86-cuda + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ 
needs.ubuntu-1604-x86-cuda.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-18.04-x86 + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-1804-x86.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-18.04-cuda + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-1804-cuda.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-18.04-x86-cuda + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-1804-x86-cuda.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + asset_content_type: application/tar+gzip + - name: Upload-ubuntu-20.04-x86 + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.ubuntu-2004-x86.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.tar.gz + asset_name: ${{ env.ASSET_NAME }}.tar.gz + 
asset_content_type: application/tar+gzip + - name: Upload-macos + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.macos.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.zip + asset_name: ${{ env.ASSET_NAME }}.zip + asset_content_type: application/zip + - name: Upload-windows + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ASSET_NAME: ${{ needs.windows.outputs.ASSET_NAME }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ${{ env.ARTIFACTS_PATH }}/${{ env.ASSET_NAME }}/${{ env.ASSET_NAME }}.zip + asset_name: ${{ env.ASSET_NAME }}.zip + asset_content_type: application/zip diff --git a/3rdparty/TNN/.gitignore b/3rdparty/TNN/.gitignore new file mode 100644 index 0000000..21221bf --- /dev/null +++ b/3rdparty/TNN/.gitignore @@ -0,0 +1,498 @@ +.DS_Store +.vscode +build +build32 +build64 +release +tags +.idea/ +tools/onnx2tnn/onnx-converter/3rdparty/ +GPATH +GRTAGS +GTAGS +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Global/JetBrains.gitignore + +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with 
auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Global/Emacs.gitignore + +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Global/Vim.gitignore + +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + + +### 
https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Global/Linux.gitignore + +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Global/macOS.gitignore + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/CMake.gitignore + +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/C++.gitignore + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ 
+sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + + +### https://raw.github.com/github/gitignore/eee21bf0c397cddc39ff1c94615d135e0ad36f8c/Android.gitignore + +# Built application files +*.aar +*.ap_ +*.aab + +# Files for the ART/Dalvik VM +*.dex + +# Java class files +*.class + +# Generated files +bin/ +gen/ +out/ +# Uncomment the following line in case you need and you don't have the release build type files in your app +# release/ + +# Gradle files +.gradle/ +build/ + +# Local configuration file (sdk path, etc) +local.properties + +# Proguard folder generated by Eclipse +proguard/ + +# Log Files +*.log + +# Android Studio Navigation editor temp files +.navigation/ + +# Android Studio captures folder +captures/ + +# IntelliJ +*.iml +.idea/workspace.xml +.idea/tasks.xml +.idea/gradle.xml +.idea/assetWizardSettings.xml +.idea/dictionaries +.idea/libraries +# Android Studio 3 in .gitignore file. +.idea/caches +.idea/modules.xml +# Comment next line if keeping position of elements in Navigation Editor is relevant for you +.idea/navEditor.xml + +# Keystore files +# Uncomment the following lines if you do not want to check your keystore files in. +#*.jks +#*.keystore + +# External native build folder generated in Android Studio 2.2 and later +.externalNativeBuild +.cxx/ + +# Google Services (e.g. 
APIs or Firebase) +# google-services.json + +# Freeline +freeline.py +freeline/ +freeline_project_description.json + +# fastlane +fastlane/report.xml +fastlane/Preview.html +fastlane/screenshots +fastlane/test_output +fastlane/readme.md + +# Version control +vcs.xml + +# lint +lint/intermediates/ +lint/generated/ +lint/outputs/ +lint/tmp/ +# lint/reports/ + +source/tnn/network/tensorrt/thirdparty/TensorRT* +model/ +# opencl generated code +opencl_program.cc + +# opencl generated code +opencl_program.cc diff --git a/3rdparty/TNN/.travis.yml b/3rdparty/TNN/.travis.yml new file mode 100644 index 0000000..db6bc36 --- /dev/null +++ b/3rdparty/TNN/.travis.yml @@ -0,0 +1,73 @@ +sudo: false + +git: + depth: 3 + quiet: true + +addons: + apt: + package: + - lcov + +matrix: + include: + - name: "Linux | Arm64 | build" + os: linux + arch: arm64 + before_install: + - ./scripts/.ci/preflight.sh arm || travis_terminate 0 + script: + - ./scripts/build_aarch64_linux.sh + + - name: "Linux | Arm32 | build" + os: linux + arch: arm64 + before_install: + - ./scripts/.ci/preflight.sh arm || travis_terminate 0 + before_script: + - sudo dpkg --add-architecture armhf + - sudo apt-get update + - sudo apt-get -y install crossbuild-essential-armhf libc6:armhf libstdc++-5-dev:armhf linux-libc-dev:armhf + script: + - ./scripts/build_armhf_linux.sh + + - name: "Linux | Arm64 | test" + os: linux + compiler: clang + arch: arm64 + before_install: + - ./scripts/.ci/preflight.sh arm || travis_terminate 0 + script: + - travis_wait 40 ./scripts/build_test.sh + + - name: "Windows | x64 | build" + os: windows + language: cpp + before_install: + - ./scripts/.ci/preflight.sh x86 || travis_terminate 0 + install: + - PowerShell -Command 'Set-ExecutionPolicy -ExecutionPolicy RemoteSigned' + - choco install ninja + script: + - scripts/build_msvc_native.bat x64 ci + env: + - CXX=cl.exe + - CXX_FOR_BUILD=cl.exe + - CC=cl.exe + - CC_FOR_BUILD=cl.exe + + - name: "Windows | x86 | build" + os: windows + language: cpp 
+ before_install: + - ./scripts/.ci/preflight.sh x86 || travis_terminate 0 + install: + - PowerShell -Command 'Set-ExecutionPolicy -ExecutionPolicy RemoteSigned' + - choco install ninja + script: + - scripts/build_msvc_native.bat x86 ci + env: + - CXX=cl.exe + - CXX_FOR_BUILD=cl.exe + - CC=cl.exe + - CC_FOR_BUILD=cl.exe diff --git a/3rdparty/TNN/CMakeLists.txt b/3rdparty/TNN/CMakeLists.txt new file mode 100644 index 0000000..684fa8f --- /dev/null +++ b/3rdparty/TNN/CMakeLists.txt @@ -0,0 +1,390 @@ +cmake_minimum_required(VERSION 3.1) + +project(TNN) + +ENABLE_LANGUAGE(ASM) + +set(TNN_MAJOR_VERSION 0) +set(TNN_MINOR_VERSION 1) +set(TNN_PATCH_VERSION 0) +set(TNN_BUILD_VERSION 0) +set(TNN_VERSION "${TNN_MAJOR_VERSION}.${TNN_MINOR_VERSION}.${TNN_PATCH_VERSION}.${TNN_BUILD_VERSION}") + +option(TNN_CPU_ENABLE "Enable Cpu" ON) +option(TNN_X86_ENABLE "Enable X86" OFF) +option(TNN_ARM_ENABLE "Enable Arm" OFF) +option(TNN_ARM82_ENABLE "Enable Arm82" OFF) +option(TNN_METAL_ENABLE "Enable Metal" OFF) +option(TNN_OPENCL_ENABLE "Enable OpenCL" OFF) +option(TNN_CUDA_ENABLE "Enable CUDA" OFF) +option(TNN_DSP_ENABLE "Enable DSP" OFF) +option(TNN_ATLAS_ENABLE "Enable Atlas" OFF) +option(TNN_TENSORRT_ENABLE "Enable TensorRT" OFF) +option(TNN_OPENVINO_ENABLE "Enable OPENVINO" OFF) +option(TNN_NPU_ENABLE "Enable NPU" OFF) +option(TNN_HUAWEI_NPU_ENABLE "Enable NPU" OFF) +option(TNN_RK_NPU_ENABLE "Enable RKNPU" OFF) +option(TNN_SYMBOL_HIDE "Enable Hide Symbol Visibility" ON) +option(TNN_OPENMP_ENABLE "Enable OpenMP" OFF) +option(TNN_BUILD_SHARED "Build Shared Library" ON) +option(TNN_OPENVINO_BUILD_SHARED "Build Shared Openvino Library" OFF) +option(TNN_TEST_ENABLE "Enable Test" OFF) +option(TNN_UNIT_TEST_ENABLE "Enable Test" OFF) +option(TNN_PROFILER_ENABLE "Enable Test" OFF) +option(TNN_QUANTIZATION_ENABLE "Enable Test" OFF) +option(TNN_MODEL_CHECK_ENABLE "Enable Test" OFF) +option(TNN_BENCHMARK_MODE "Enable Benchmark" OFF) +option(TNN_UNIT_TEST_BENCHMARK "Enable Benchmark Layer" OFF) 
+option(TNN_CONVERTER_ENABLE "Enable Model Converter" OFF) +option(TNN_ONNX2TNN_ENABLE "Enable ONNX2TNN Converter" OFF) +option(TNN_TNN2MEM_ENABLE "Enable tnn2mem" OFF) +option(TNN_BUILD_BENCHMARK_TEST_LIB_ENABLE "Enable Build Benchmark Test Lib" OFF) +option(TNN_GLIBCXX_USE_CXX11_ABI_ENABLE "Enable Use CXX11 ABI" ON) + +set(TNN_USE_GFLAGS OFF) + +message(${CMAKE_SOURCE_DIR}) +message(${CMAKE_CURRENT_SOURCE_DIR}) + +include(cmake/macros.cmake) + +if (SYSTEM.Windows) + add_definitions(-DBUILDING_DLL) +endif() + +if(TNN_PROFILER_ENABLE) + add_definitions(-DTNN_PROFILE) + set(TNN_SYMBOL_HIDE OFF) +endif() + +if(TNN_BENCHMARK_MODE) + add_definitions(-DGENERATE_RESOURCE) +endif() + +if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4003 /wd4819 /wd4244 /wd4018 /utf-8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4003 /wd4819 /wd4244 /wd4018 /utf-8") +endif() + +# ignore loop-vectorize warning +if(SYSTEM.Windows) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pass-failed") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pass-failed") +endif() + +# ignore deprecated warning +if(SYSTEM.Windows) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-declarations -Wno-ignored-attributes") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-ignored-attributes") +endif() + +if(DEBUG) + set(TNN_SYMBOL_HIDE OFF) + add_definitions(-DDEBUG) + if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "") + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "set build type to debug" FORCE) + endif() +else() + if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "set build type to release" FORCE) + endif() + if(BUILD_FOR_ANDROID_COMMAND) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s -Wl,--gc-sections") + endif() +endif() + 
+if(TNN_TEST_ENABLE) + option(TNN_METAL_FLOAT32 "Enable Metal Float32" ON) +else() + set(TNN_UNIT_TEST_ENABLE OFF) +endif() + +if(TNN_UNIT_TEST_ENABLE) + enable_testing() + set(TNN_CPU_ENABLE ON) + set(TNN_SYMBOL_HIDE OFF) + add_definitions(-DGENERATE_RESOURCE) +endif() + +if(TNN_CONVERTER_ENABLE OR TNN_ONNX2TNN_ENABLE) + set(TNN_SYMBOL_HIDE OFF) + add_definitions(-DTNN_CONVERTER_RUNTIME) +endif() + +if(TNN_QUANTIZATION_ENABLE OR TNN_MODEL_CHECK_ENABLE) + set(TNN_SYMBOL_HIDE OFF) + add_definitions(-DFORWARD_CALLBACK_ENABLE) +endif() + +if(TNN_QUANTIZATION_ENABLE OR TNN_UNIT_TEST_ENABLE) + add_definitions(-DGET_INTERP_ENABLE) +endif() + +if(TNN_MODEL_CHECK_ENABLE) + option(TNN_METAL_FLOAT32 "Enable Metal Float32" ON) +endif() + +if(TNN_ARM82_ENABLE) + add_definitions(-DTNN_ARM82=1) +endif() + +# only used to simulate arm82 computation in the unit test +option(TNN_ARM82_SIMU "Enable arm82 simulation" OFF) +if(TNN_ARM82_SIMU) + add_definitions(-DTNN_ARM82_SIMU) +endif() + +if(TNN_METAL_FLOAT32) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTNN_METAL_FULL_PRECISION=1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTNN_METAL_FULL_PRECISION=1") + + if(TNN_PROFILER_ENABLE OR TNN_MODEL_CHECK_ENABLE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTNN_METAL_BENCHMARK=1 -DTNN_METAL_DEBUG=1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTNN_METAL_BENCHMARK=1 -DTNN_METAL_DEBUG=1") + endif() +endif() + +if(TNN_OPENMP_ENABLE) + FIND_PACKAGE(OpenMP REQUIRED) + if(OPENMP_FOUND) + if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + include_directories(${OpenMP_C_INCLUDE_DIRS} ${OpenMP_CXX_INCLUDE_DIRS}) + link_libraries(${OpenMP_C_LIBRARIES} ${OpenMP_CXX_LIBRARIES}) + endif() + else() + error("OpenMP Not Found.") + endif() +endif() + + +if(UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + 
if(TNN_GLIBCXX_USE_CXX11_ABI_ENABLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + endif() +endif() + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +if(TNN_METAL_ENABLE) + add_compile_options(-x objective-c++) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fobjc-arc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fobjc-arc") +endif() + +if(TNN_TNN2MEM_ENABLE) + add_subdirectory(tools/tnn2mem) +endif() + +message(STATUS ">>>>>>>>>>>>>") +message(STATUS "TNN BUILD INFO:") +message(STATUS "\tSystem: ${CMAKE_SYSTEM_NAME}") +message(STATUS "\tProcessor: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "\tCpu:\t${TNN_CPU_ENABLE}") +message(STATUS "\tX86:\t${TNN_X86_ENABLE}") +message(STATUS "\tArm:\t${TNN_ARM_ENABLE}") +message(STATUS "\tArm82:\t${TNN_ARM82_ENABLE}") +message(STATUS "\tMetal:\t${TNN_METAL_ENABLE}") +message(STATUS "\tOpenCL:\t${TNN_OPENCL_ENABLE}") +message(STATUS "\tCUDA:\t${TNN_CUDA_ENABLE}") +message(STATUS "\tDSP:\t${TNN_DSP_ENABLE}") +message(STATUS "\tAtlas:\t${TNN_ATLAS_ENABLE}") +message(STATUS "\tTensorRT:\t${TNN_TENSORRT_ENABLE}") +message(STATUS "\tHuaweiNPU:\t${TNN_HUAWEI_NPU_ENABLE}") +message(STATUS "\tRKNPU:\t${TNN_RK_NPU_ENABLE}") +message(STATUS "\tOpenVINO:\t${TNN_OPENVINO_ENABLE}") +message(STATUS "\tOpenMP:\t${TNN_OPENMP_ENABLE}") +message(STATUS "\tTEST:\t${TNN_TEST_ENABLE}") +message(STATUS "\t--Unit Test:\t${TNN_UNIT_TEST_ENABLE}") +message(STATUS "\tQuantization:\t${TNN_QUANTIZATION_ENABLE}") +message(STATUS "\tModelCheck:\t${TNN_MODEL_CHECK_ENABLE}") +message(STATUS "\tDEBUG:\t${DEBUG}") +message(STATUS "\tPROFILE:\t${TNN_PROFILER_ENABLE}") +message(STATUS "\tBENCHMARK:\t${TNN_BENCHMARK_MODE}") +message(STATUS "\tBENCHMARK Layer:\t${TNN_UNIT_TEST_BENCHMARK}") +message(STATUS "\tModel Converter:\t${TNN_CONVERTER_ENABLE}") +message(STATUS "\tONNX2TNN Converter:\t${TNN_ONNX2TNN_ENABLE}") +message(STATUS 
"\tTNN2MEM:\t${TNN_TNN2MEM_ENABLE}") +message(STATUS "\tBENCHMARK Test Lib:\t${TNN_BUILD_BENCHMARK_TEST_LIB_ENABLE}") + +include_directories(include) +include_directories(source) + +file(GLOB_RECURSE SRC "source/tnn/core/*.h" + "source/tnn/core/*.cc" + "source/tnn/layer/*.h" + "source/tnn/layer/*.cc" + "source/tnn/utils/*.h" + "source/tnn/utils/*.cc" + "source/tnn/interpreter/*.h" + "source/tnn/interpreter/*.cc" + "source/tnn/optimizer/*.h" + "source/tnn/optimizer/*.cc" + "source/tnn/extern_wrapper/*.h" + "source/tnn/extern_wrapper/*.cc" + "source/tnn/memory_manager/*.h" + "source/tnn/memory_manager/*.cc") + +if(TNN_SYMBOL_HIDE AND UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden") +endif() + +if(TNN_X86_ENABLE) + add_subdirectory(source/tnn/device/x86) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_CPU_ENABLE) + add_subdirectory(source/tnn/device/cpu) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_ARM_ENABLE) + add_subdirectory(source/tnn/device/arm) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") + if(TNN_ARM82_ENABLE) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") + endif() +endif() + +if(TNN_OPENVINO_ENABLE) + add_subdirectory(source/tnn/network/openvino) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_OPENCL_ENABLE) + include(FindPythonInterp REQUIRED) + if (NOT PYTHON_EXECUTABLE) + message (FATAL_ERROR "No Python installation found! 
It is required by OpenCL codegen.") + endif () + + if(SHARING_MEM_WITH_OPENGL) + add_definitions(-DSHARING_MEM_WITH_OPENGL) + add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=120) + endif() + add_subdirectory(source/tnn/device/opencl) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_METAL_ENABLE) + add_subdirectory(source/tnn/device/metal) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_CUDA_ENABLE) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") + add_subdirectory(source/tnn/device/cuda) + if(TNN_TENSORRT_ENABLE) + add_subdirectory(source/tnn/network/tensorrt) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") + endif() + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_HUAWEI_NPU_ENABLE) + if(ANDROID_ABI STREQUAL "armeabi-v7a") + link_directories( + third_party/huawei_npu/hiai_ddk_latest/armeabi-v7a/ + ) + else() + link_directories( + third_party/huawei_npu/hiai_ddk_latest/arm64-v8a/ + ) + endif() + add_subdirectory(source/tnn/device/huawei_npu) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_RK_NPU_ENABLE) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + link_directories( + ./third_party/rknpu/rknpu_ddk/lib64/ + ) + else() + link_directories( + ./third_party/rknpu/rknpu_ddk/lib/ + ) + endif() + add_subdirectory(source/tnn/device/rknpu) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_BUILD_SHARED) + add_library(TNN SHARED ${SRC} ${TARGET_OBJECTS}) + set_target_properties(TNN PROPERTIES VERSION ${TNN_VERSION} SOVERSION ${TNN_MAJOR_VERSION}) + if(SHARING_MEM_WITH_OPENGL) + target_link_libraries(TNN -lEGL -lGLESv2) + endif() +else() + add_library(TNN STATIC ${SRC} ${TARGET_OBJECTS}) + set_target_properties(TNN PROPERTIES VERSION ${TNN_VERSION}) + if(SHARING_MEM_WITH_OPENGL) + target_link_libraries(TNN -lEGL -lGLESv2) + endif() +endif() + +if(TNN_QUANTIZATION_ENABLE) + add_subdirectory(tools/quantization) +endif() + +if(SYSTEM.Linux) + include(platforms/linux/CMakeLists.txt) 
+elseif(SYSTEM.Android) + include(platforms/android/CMakeLists.txt) +elseif(SYSTEM.iOS) + include(platforms/ios/CMakeLists.txt) +elseif(SYSTEM.Darwin) + include(platforms/mac/CMakeLists.txt) +elseif(SYSTEM.Windows) + include(platforms/windows/CMakeLists.txt) +endif() + +if (TNN_TEST_ENABLE OR TNN_CONVERTER_ENABLE OR TNN_MODEL_CHECK_ENABLE) + set(TNN_USE_GFLAGS ON) +endif () + +if (TNN_USE_GFLAGS) + add_subdirectory(third_party/gflags) + get_target_property(GFLAGS_INCLUDE_DIRS gflags INTERFACE_INCLUDE_DIRECTORIES) + include_directories(BEFORE "${GFLAGS_INCLUDE_DIRS}") +endif () + +if(TNN_MODEL_CHECK_ENABLE) + add_subdirectory(tools/model_check) +endif() + +if(TNN_TEST_ENABLE) + add_subdirectory(test) +endif() + +if(TNN_CONVERTER_ENABLE) + add_subdirectory(third_party/flatbuffers) + add_subdirectory(tools/converter) +endif() + +if(TNN_ONNX2TNN_ENABLE) + add_subdirectory(tools/onnx2tnn/onnx-converter) +endif() + +if(TNN_COVERAGE) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -coverage -fprofile-arcs -ftest-coverage") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -coverage -lgcov") + endif() +endif() diff --git a/3rdparty/TNN/Dockerfile b/3rdparty/TNN/Dockerfile new file mode 100644 index 0000000..30d5296 --- /dev/null +++ b/3rdparty/TNN/Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:18.04 + +ENV LANG C.UTF-8 +ENV LANGUAGE C.UTF-8 +ENV LC_ALL C.UTF-8 + +RUN sed -i s@/archive.ubuntu.com/@/mirrors.tencent.com/@g /etc/apt/sources.list +RUN sed -i s@/security.ubuntu.com/@/mirrors.tencent.com/@g /etc/apt/sources.list + +RUN apt-get clean && apt-get update -y && apt-get -y install --no-install-recommends apt-utils + +RUN apt-get -y install git cmake make + +RUN apt-get -y install gcc g++ + +RUN apt-get -y install protobuf-compiler libprotobuf-dev + +RUN apt-get -y install python3 python3-dev 
python3-pip + +#RUN mkdir -p /root/.pip && echo "[global]\n index-url = https://mirrors.tencent.com/pypi/simple/" >> /root/.pip/pip.conf + +RUN python3 -m pip install --upgrade pip && pip3 install -U onnx==1.6.0 onnxruntime numpy onnx-simplifier setuptools protobuf + + +RUN pip3 install tensorflow==1.15.0 tf2onnx + +ENV TNN_ROOT=/opt/TNN +ENV TOOLS_ROOT=$TNN_ROOT/tools +# COPY ./onnx2tnn $TOOLS_ROOT/onnx2tnn +# COPY ./caffe2onnx $TOOLS_ROOT/caffe2onnx +# COPY ./convert2tnn $TOOLS_ROOT/convert2tnn +COPY . $TNN_ROOT/ +#RUN cd $TOOLS_ROOT/onnx2tnn/onnx-converter && ./build.sh +RUN cd $TOOLS_ROOT/convert2tnn && bash ./build.sh + + +RUN python3 $TOOLS_ROOT/convert2tnn/converter.py -h + +WORKDIR $TOOLS_ROOT/convert2tnn/ diff --git a/3rdparty/TNN/LICENSE b/3rdparty/TNN/LICENSE new file mode 100644 index 0000000..e8a499e Binary files /dev/null and b/3rdparty/TNN/LICENSE differ diff --git a/3rdparty/TNN/README.md b/3rdparty/TNN/README.md new file mode 100644 index 0000000..de1be0f --- /dev/null +++ b/3rdparty/TNN/README.md @@ -0,0 +1,132 @@ +[中文版本](README_CH.md) +
+ +## Introduction + +TNN: A high-performance, lightweight neural network inference framework open sourced by Tencent Youtu Lab. It also has many outstanding advantages such as cross-platform, high performance, model compression, and code tailoring. The TNN framework further strengthens the support and performance optimization of mobile devices on the basis of the original Rapidnet and ncnn frameworks. At the same time, it refers to the high performance and good scalability characteristics of the industry's mainstream open source frameworks, and expands the support for X86 and NV GPUs. On the mobile phone, TNN has been used by many applications such as mobile QQ, weishi, and Pitu. As a basic acceleration framework for Tencent Cloud AI, TNN has provided acceleration support for the implementation of many businesses. Everyone is welcome to participate in the collaborative construction to promote the further improvement of the TNN reasoning framework. + +## Effect Example + +Face Detection(blazeface) | Object Detection(yolov5s) | Face Alignment
(from Tencent Youtu Lab) | Hair Segmentation
(from Tencent Guangying Lab) +:-------------------------: | :------: | :------: | :------: +[![face_detection](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/face_detection.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/blazeface)
model link: [tflite](https://github.com/google/mediapipe/blob/master/mediapipe/models/face_detection_front.tflite) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/blazeface) | [![yolov5](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/object-detection.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/yolov5)
model link: [onnx](https://github.com/ultralytics/yolov5/blob/master/models/export.py) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/yolov5) | [![youtu_face_alignment](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/face_alignment.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/youtu_face_alignment)
model link: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/youtu_face_alignment) | [![hair_segmentation](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/hair_seg_red.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/hair_segmentation)
model link: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/hair_segmentation) + +Pose Estimation
(from Tencent Guangliu) | Pose Estimation
(blazepose) | Chinese OCR | Reading Comprehension +:--------------------------: | :------: | :------: | :------: +[![skeleton](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/skeleton_guangliu.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/skeleton)
model link: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/skeleton) | [![blazepose](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/skeleton_blazepose.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/blazepose)
model link: [tflite](https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_landmark_full_body.tflite) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/blazepose) | [![chinese-ocr](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/chinese-ocr.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/chinese-ocr)
model link: [onnx](https://github.com/DayBreak-u/chineseocr_lite/tree/onnx/models) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/chinese-ocr) | [![bertsquad10](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/bert_squad.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/bertsquad10)
model link: [onnx](https://github.com/onnx/models/blob/master/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/bertsquad10) + +Chinese OCR demo is the TNN implementation of [chineseocr_lite](https://github.com/DayBreak-u/chineseocr_lite) project. It is lightweight and supports tilted, rotated and vertical text recognition. + +The support for each demo is shown in the following table. You can click the ✅ and find the entrance code for each demo. +demo | ARM | OpenCL | Metal | NPU | X86 | CUDA +:---------------------------------------------------------------------------------------: | :------: | :------: | :------: | :------: | :------: | :------: +[Face Detection](https://github.com/Tencent/TNN/blob/master/examples/base/blazeface_detector.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceDetector) | [✅ ](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNBlazeFaceDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceDetector) | | +[Object Detection](https://github.com/Tencent/TNN/blob/master/examples/base/object_detector_yolo.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamObjectDetector) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNYoloObjectDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamObjectDetector) | | +[Face Alignment](https://github.com/Tencent/TNN/blob/master/examples/base/face_detect_aligner.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceAlign) | 
[✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNFaceDetectAlignerViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceAlign) | | +[Hair Segmentation](https://github.com/Tencent/TNN/blob/master/examples/base/hair_segmentation.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamHairSegmentation) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNHairSegmentationViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamHairSegmentation) | | +[Pose Estimation
(from Tencent Guangliu)](https://github.com/Tencent/TNN/blob/master/examples/base/skeleton_detector.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamSkeletonDetector) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNSkeletonDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamSkeletonDetector) | | +[Pose Estimation(blazepose)](https://github.com/Tencent/TNN/blob/master/examples/base/pose_detect_landmark.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamPoseDetectLandmark) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNPoseDetectLandmarkViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamPoseDetectLandmark) | | | +[Chinese OCR](https://github.com/Tencent/TNN/blob/master/examples/base/ocr_driver.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamOCRDetector) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNOCRViewModel.mm) | | | +[Reading Comprehension](https://github.com/Tencent/TNN/blob/master/examples/base/bert_tokenizer.cc) | | | | | [✅](https://github.com/Tencent/TNN/blob/master/examples/linux/src/BertReadingComprehension/BertReadingComprehension.cc) | [✅](https://github.com/Tencent/TNN/blob/master/examples/linux/src/BertReadingComprehension/BertReadingComprehension.cc) + +## Quick Start + +It is very simple to use TNN. If you have a trained model, the model can be deployed on the target platform through three steps. +1. Convert the trained model into a TNN model. 
We provide a wealth of tools to help you complete this step, whether you are using Tensorflow, Pytorch, or Caffe, you can easily complete the conversion. +Detailed hands-on tutorials can be found here [How to Create a TNN Model](doc/en/user/convert_en.md). + +2. When you have finished converting the model, the second step is to compile the TNN engine of the target platform. You can choose among different acceleration solutions such as ARM/OpenCL/Metal/NPU/X86/CUDA according to the hardware support. + For these platforms, TNN provides convenient one-click scripts to compile. For detailed steps, please refer to [How to Compile TNN](doc/en/user/compile_en.md). + +3. The final step is to use the compiled TNN engine for inference. You can make program calls to TNN inside your application. We provide a rich and detailed demo as a reference to help you complete. + * [Run an iOS Demo](doc/en/user/demo_en.md) + * [Run an Android Demo](doc/en/user/demo_en.md) + * [Run an Linux/Windows Demo](doc/en/user/demo_en.md) + +## Technical Solutions + +At present, TNN has been launched in various major businesses, and its following characteristics have been widely praised. + +* Computation optimization + * The backend operators are primely optimized to make the best use of computing power in different architectures, regarding instruction issue, throughput, delay, cache bandwidth, cache delay, registers, etc.. + * The TNN performance on mainstream hardware platforms (CPU: ARMv7, ARMv8, X86, GPU: Mali, Adreno, Apple, NV GPU, NPU) has been greatly tuned and improved. + * The convolution function is implemented by various algorithms such as Winograd, Tile-GEMM, Direct Conv, etc., to ensure efficiency under different parameters and sizes. + * Op fusion: TNN can do offline analysis of network graph, fuse multiple simple operations and reduce overhead such as redundant memory access and kernel startup cost. 
+ +* Low precision computation acceleration + * TNN supports INT8/FP16 mode, reduces model size & memory consumption, and utilizes specific hardware low-precision instructions to accelerate calculations. + * TNN supports INT8 WINOGRAD algorithm (input 6bit), further reduces the model calculation complexity without sacrificing the accuracy. + * TNN supports mixed-precision data in one model, speeding up the model's calculation while preserving its accuracy. + +* Memory optimization + * Efficient "memory pool" implementation: Based on a full network DAG analysis, the implementation reuses memory between non-dependent nodes which reduces memory cost by 90%. + * Cross-model memory reuse: This supports external real-time specification of network memory so that multiple models can share the same memory. + +* The performance of mainstream models on TNN: [benchmark data](doc/benchmark_data.md) + +* TNN architecture diagram: + +
+ +* TNN supports TensorFlow, PyTorch, MXNet, Caffe, and other training frameworks through ONNX, leveraging the continuous improvement of the ONNX open-source community. + Currently, TNN supports 100+ ONNX operators, covering most of the mainstream CNN and NLP operators needed. +* TNN runs on mainstream operating systems (Android, iOS, embedded Linux, Windows, Linux), and is compatible with ARM CPU, X86, GPU, NPU hardware platforms. +* TNN is constructed through Modular Design, which abstracts and isolates components such as model analysis, graph construction, graph optimization, low-level hardware adaptation, and high-performance kernel. + It uses "Factory Mode" to register and build devices, which minimizes the cost of supporting more hardware and acceleration solutions. +* The size of the mobile dynamic library is only around 400KB, and it provides basic image conversion operations, which are light-weight and convenient. TNN uses unified models and interfaces across platforms and can switch easily by configuring just one single parameter. 
+ +## Learn About TNN Abilities +* [Operator Support](doc/en/user/support_en.md) +* [Model Support](doc/en/user/support_en.md) +* [Device Support](doc/en/user/support_en.md) +* [Profiling](doc/en/development/profiling_en.md) + +## Manual +* [Compile TNN](doc/en/user/compile_en.md) +* [Tools]() + * [Create a TNN Model](doc/en/user/convert_en.md) + * [Model Quantization](doc/en/user/quantization_en.md) + * [Model Visualization Netron](https://lutzroeder.github.io/netron/) + * [Performance Analysis](doc/en/development/profiling_en.md) + * [Model Alignment](doc/en/development/model_check_en.md) + +## API Document +* [API call](doc/en/user/api_en.md) + +## Contribute to TNN +* [Development Basics](doc/en/development/contributing_en.md) +* [Detailed Architecture](doc/en/development/architecture_en.md) +* [Add a New Operator](doc/en/development/add_op_en.md) +* [Unit Test](doc/en/development/unit_test_en.md) + +## Roadmap +* [Road map](doc/cn/user/roadmap.md) + +## Acknowledgement +TNN referenced the following projects: + +* [ncnn](https://github.com/Tencent/ncnn) +* [mace](https://github.com/XiaoMi/mace.git) +* [MNN](https://github.com/alibaba/MNN) +* [caffe-onnx](https://github.com/htshinichi/caffe-onnx) +* [tensorflow-onnx](https://github.com/onnx/tensorflow-onnx) +* [onnx](https://github.com/onnx/onnx) +* [onnxruntime](https://github.com/microsoft/onnxruntime) +* [openvino](https://github.com/openvinotoolkit/openvino) +* [xbyak](https://github.com/herumi/xbyak) +* [TensorRT](https://developer.nvidia.com/tensorrt) + +## License +* [BSD 3 Clause](LICENSE) + +## FAQ +* [FAQ](doc/en/faq_en.md) + +## Join Us + +* Everyone is welcome to participate to build the best inference framework in the industry. + +* Technical Discussion QQ Group: 913940506 Answer: TNN + +* Scan the QR code to join the TNN discussion group: +
diff --git a/3rdparty/TNN/README_CH.md b/3rdparty/TNN/README_CH.md new file mode 100644 index 0000000..5e53c38 --- /dev/null +++ b/3rdparty/TNN/README_CH.md @@ -0,0 +1,132 @@ +[English Version](README.md) +
+ +## 简介 + +TNN:由腾讯优图实验室开源的高性能、轻量级神经网络推理框架,同时拥有跨平台、高性能、模型压缩、代码裁剪等众多突出优势。TNN框架在原有Rapidnet、ncnn框架的基础上进一步加强了移动端设备的支持以及性能优化,同时借鉴了业界主流开源框架高性能和良好拓展性的特性,拓展了对于后台X86, NV GPU的支持。手机端 TNN已经在手Q、微视、P图等众多应用中落地,服务端TNN作为腾讯云AI基础加速框架已为众多业务落地提供加速支持。欢迎大家参与协同共建,促进TNN推理框架进一步完善。 + + +## 效果示例 + +人脸检测(blazeface) | 物体检测(yolov5s) | 人脸配准(腾讯优图) | 头发分割(腾讯光影) +:-------------------------: | :------: | :------: | :------: +[![face_detection](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/face_detection.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/blazeface)
模型链接: [tflite](https://github.com/google/mediapipe/blob/master/mediapipe/models/face_detection_front.tflite) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/blazeface) | [![yolov5](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/object-detection.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/yolov5)
模型链接: [onnx](https://github.com/ultralytics/yolov5/blob/master/models/export.py) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/yolov5) | [![youtu_face_alignment](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/face_alignment.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/youtu_face_alignment)
模型链接: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/youtu_face_alignment) | [![hair_segmentation](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/hair_seg_red.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/hair_segmentation)
模型链接: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/hair_segmentation) + +姿势估计(腾讯光流) | 姿势估计(blazepose) | 中文字符识别 | 阅读理解 +:--------------------------: | :------: | :------: | :------: +[![skeleton](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/skeleton_guangliu.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/skeleton)
模型链接: [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/skeleton) | [![blazepose](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/skeleton_blazepose.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/blazepose)
模型链接: [tflite](https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_landmark/pose_landmark_full_body.tflite) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/blazepose) | [![chinese-ocr](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/chinese-ocr.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/chinese-ocr)
模型链接: [onnx](https://github.com/DayBreak-u/chineseocr_lite/tree/onnx/models) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/chinese-ocr) | [![bertsquad10](https://raw.githubusercontent.com/darrenyao87/tnn-models/master/doc/demo/bert_squad.gif)](https://github.com/darrenyao87/tnn-models/tree/master/model/bertsquad10)
模型链接: [onnx](https://github.com/onnx/models/blob/master/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx) [tnn](https://github.com/darrenyao87/tnn-models/tree/master/model/bertsquad10) + +中文字符识别demo是[chineseocr_lite](https://github.com/DayBreak-u/chineseocr_lite)的TNN实现,是一个超轻量级的中文ocr,支持倾斜、旋转和竖排文字识别。 + +各个平台对demo的支持情况如下表所示,单击✅标记,便可以跳转至对应demo的入口代码。 +demo | ARM | OpenCL | Metal | NPU | X86 | CUDA +:---------------------------------------------------------------------------------------: | :------: | :------: | :------: | :------: | :------: | :------: +[人脸检测](https://github.com/Tencent/TNN/blob/master/examples/base/blazeface_detector.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceDetector) | [✅ ](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNBlazeFaceDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceDetector) | | +[物体检测](https://github.com/Tencent/TNN/blob/master/examples/base/object_detector_yolo.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamObjectDetector) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNYoloObjectDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamObjectDetector) | | +[人脸配准](https://github.com/Tencent/TNN/blob/master/examples/base/face_detect_aligner.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceAlign) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNFaceDetectAlignerViewModel.mm) | 
[✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamBlazeFaceAlign) | | +[头发分割](https://github.com/Tencent/TNN/blob/master/examples/base/hair_segmentation.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamHairSegmentation) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNHairSegmentationViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamHairSegmentation) | | +[姿势估计(腾讯光流)](https://github.com/Tencent/TNN/blob/master/examples/base/skeleton_detector.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamSkeletonDetector) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNSkeletonDetectorViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamSkeletonDetector) | | +[姿势估计(blazepose)](https://github.com/Tencent/TNN/blob/master/examples/base/pose_detect_landmark.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamPoseDetectLandmark) | [✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNPoseDetectLandmarkViewModel.mm) | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamPoseDetectLandmark) | | +[中文字符识别](https://github.com/Tencent/TNN/blob/master/examples/base/ocr_driver.cc) | ✅ | [✅](https://github.com/Tencent/TNN/tree/master/examples/android/demo/src/main/java/com/tencent/tnn/demo/StreamOCRDetector) | 
[✅](https://github.com/Tencent/TNN/blob/master/examples/ios/TNNExamples/TNNCameraPreviewController/TNNViewModel/TNNOCRViewModel.mm) | | | +[阅读理解](https://github.com/Tencent/TNN/blob/master/examples/base/bert_tokenizer.cc) | | | | | [✅](https://github.com/Tencent/TNN/blob/master/examples/linux/src/BertReadingComprehension/BertReadingComprehension.cc) | [✅](https://github.com/Tencent/TNN/blob/master/examples/linux/src/BertReadingComprehension/BertReadingComprehension.cc) + +## 快速开始 + +使用 TNN 非常简单,如果你有一个已经训练好的模型, 那么一般而言通过以下三个步骤就能完成模型在目标平台上的部署。 +1. 第一步是把训练好的模型转换成TNN的模型,为此我们提供了丰富的工具来帮助你完成这一步,无论你使用的是 TensorFlow、PyTorch、或者 Caffe,都可以轻松完成转换。 +详细的手把手教程可以参见这里[如何转换模型](doc/cn/user/convert.md)。 + +2. 当你完成了模型的转换,第二步就是编译目标平台的 TNN 引擎了,你可以根据自己的目标平台的硬件支持情况,选择 CPU/ARM/OpenCL/Metal/NPU/X86/CUDA 等加速方案。 + 对于这些平台,TNN 都提供了一键编译的脚本,使用非常方便。详细步骤可以参考这里[如何编译TNN](doc/cn/user/compile.md)。 + +3. 最后一步就是使用编译好的 TNN 引擎进行推理,你可以在自己的应用程序中嵌入对 TNN 的调用,这方面我们提供了丰富而详实的 demo 来帮助你完成。 + * [从0开始跑通一个iOS Demo](doc/cn/user/demo.md) + * [从0开始跑通一个Android Demo](doc/cn/user/demo.md) + * [从0开始跑通一个Windows/Linux Demo](doc/cn/user/demo.md#四) + +## 技术方案 + +目前TNN具有的以下特性获得了广泛的好评。 + +* 计算优化 + * 针对不同架构在硬件指令发射、吞吐、延迟、缓存带宽、缓存延迟、寄存器数量等特点,深度优化底层算子,极致利用硬件算力 + * 主流硬件平台(CPU: ARMv7, ARMv8,X86 GPU: Mali, Adreno, Apple, NV GPU) 深度调优 + * CNN 核心卷积运算通过 Winograd,Tile-GEMM, Direct Conv 等多种算法实现,保证不同参数、计算尺度下高效计算 + * Op 融合:离线分析网络计算图,多个小 Op(计算量小、功能较简单)融合运算,减少反复内存读取、kernel 启动等开销 + +* 低精度优化 + * 支持 INT8, FP16 低精度计算,减少模型大小、内存消耗,同时利用硬件低精度计算指令加速计算 + * 支持 INT8 Winograd 算法,(输入6bit), 在精度满足要求的情况下,进一步降低模型计算复杂度 + * 支持单模型多种精度混合计算,加速计算同时保证模型精度 + +* 内存优化 + * 高效”内存池”实现:通过 DAG 网络计算图分析,实现无计算依赖的节点间复用内存,降低 90% 内存资源消耗 + * 跨模型内存复用:支持外部实时指定用于网络内存,实现“多个模型,单份内存”。 + +* 主流模型实测性能:[评测数据](doc/benchmark_data.md) + +* TNN架构图: + +
+ +* 通过 ONNX 支持 TensorFlow, PyTorch, MXNet, Caffe 等多种训练框架,充分利用和融入不断完善的 ONNX 开源生态。当前支持 ONNX 算子100+,覆盖主流CNN, NLP网络。 +* 支持主流安卓、iOS、Embedded Linux 操作系统, Windows, Linux,支持 ARM CPU, x86, Mali GPU, Adreno GPU, NV GPU, 达芬奇NPU,RK NPU。 +* 模块化设计,将模型解析、计算图构建、优化、底层硬件适配、高性能 kernel 实现各部分抽象隔离,通过 Factory Mode 注册、构建设备,方便接入更多的底层硬件、加速方案。 +* 移动端动态库尺寸仅约 400KB,并提供基础图像变换操作,调用简单便捷。跨平台模型统一、调用接口统一,通过单个配置参数快速切换。 + +## 能力展示 +* [支持的算子](doc/cn/user/support.md) +* [支持的网络](doc/cn/user/support.md) +* [支持的架构](doc/cn/user/support.md) +* [Benchmark性能测试方法](doc/cn/development/profiling.md) + +## 使用手册 +* [从源码编译](doc/cn/user/compile.md) +* [工具集]() + * [模型转换](doc/cn/user/convert.md) + * [模型量化](doc/cn/user/quantization.md) + * [模型可视化Netron](https://lutzroeder.github.io/netron/) + * [性能分析工具](doc/cn/development/profiling.md) + * [模型对齐工具](doc/cn/development/model_check.md) + +## API文档 +* [API调用](doc/cn/user/api.md) + +## 贡献者须知 +* [开发基础须知](doc/cn/development/contributing.md) +* [架构详解](doc/cn/development/architecture.md) +* [新增OP](doc/cn/development/add_op.md) +* [单元测试](doc/cn/development/unit_test.md) + +## Roadmap +* [Road map](doc/cn/user/roadmap.md) + +## 致谢 +TNN参考和借鉴了下列项目: + +* [ncnn](https://github.com/Tencent/ncnn) +* [mace](https://github.com/XiaoMi/mace.git) +* [MNN](https://github.com/alibaba/MNN) +* [caffe-onnx](https://github.com/htshinichi/caffe-onnx) +* [tensorflow-onnx](https://github.com/onnx/tensorflow-onnx) +* [onnx](https://github.com/onnx/onnx) +* [onnxruntime](https://github.com/microsoft/onnxruntime) +* [openvino](https://github.com/openvinotoolkit/openvino) +* [xbyak](https://github.com/herumi/xbyak) +* [TensorRT](https://developer.nvidia.com/zh-cn/tensorrt) + +## License + +* [BSD 3 Clause](LICENSE) + +## FAQ +* [FAQ 常见问题](doc/cn/faq.md) + +## 加入我们 + +* 欢迎大家参与,协同共建,打造业界最好的高性能推理框架。 + +* 技术交流 QQ 群: 913940506 答案:TNN + +* QQ 群二维码: +
diff --git a/3rdparty/TNN/RELEASE.md b/3rdparty/TNN/RELEASE.md new file mode 100644 index 0000000..e69de29 diff --git a/3rdparty/TNN/TNN-QQ.png b/3rdparty/TNN/TNN-QQ.png new file mode 100644 index 0000000..cc236ff Binary files /dev/null and b/3rdparty/TNN/TNN-QQ.png differ diff --git a/3rdparty/TNN/TNN.png b/3rdparty/TNN/TNN.png new file mode 100644 index 0000000..b07ecc7 Binary files /dev/null and b/3rdparty/TNN/TNN.png differ diff --git a/3rdparty/TNN/_config.yml b/3rdparty/TNN/_config.yml new file mode 100644 index 0000000..c419263 --- /dev/null +++ b/3rdparty/TNN/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman \ No newline at end of file diff --git a/3rdparty/TNN/benchmark/.gitignore b/3rdparty/TNN/benchmark/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/3rdparty/TNN/benchmark/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/3rdparty/TNN/benchmark/benchmark-model/densenet.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/densenet.tnnproto new file mode 100644 index 0000000..8464dc5 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/densenet.tnnproto @@ -0,0 +1,315 @@ +"1 0 1 4206624770 ," +"Placeholder 1 3 224 224 ," +" ," +"softmax_tensor ," +" 310 ," +"Convolution Relu 1 1 Placeholder Relu 1 3 64 7 7 2 2 0 0 1 0 1 1 1 ," +"Pooling max_pooling2d/MaxPool 1 1 Relu max_pooling2d/MaxPool 0 3 3 2 2 0 0 -1 -1 0 1 ," +"Mul block-0/denseblock-0-0/batch_normalization/FusedBatchNorm_mul_0 1 1 max_pooling2d/MaxPool block-0/denseblock-0-0/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-0/Relu 1 1 block-0/denseblock-0-0/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-0/Relu_output 1 ," +"ReLU block-0/denseblock-0-0/Relu_activation 1 1 block-0/denseblock-0-0/Relu_output block-0/denseblock-0-0/Relu ," +"Convolution block-0/denseblock-0-0/conv2d/Conv2D 1 1 block-0/denseblock-0-0/Relu block-0/denseblock-0-0/conv2d/Conv2D 1 64 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-0/concat 2 1 
max_pooling2d/MaxPool block-0/denseblock-0-0/conv2d/Conv2D block-0/denseblock-0-0/concat 1 ," +"Mul block-0/denseblock-0-1/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-0/concat block-0/denseblock-0-1/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-1/Relu 1 1 block-0/denseblock-0-1/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-1/Relu_output 1 ," +"ReLU block-0/denseblock-0-1/Relu_activation 1 1 block-0/denseblock-0-1/Relu_output block-0/denseblock-0-1/Relu ," +"Convolution block-0/denseblock-0-1/conv2d/Conv2D 1 1 block-0/denseblock-0-1/Relu block-0/denseblock-0-1/conv2d/Conv2D 1 96 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-1/concat 2 1 block-0/denseblock-0-0/concat block-0/denseblock-0-1/conv2d/Conv2D block-0/denseblock-0-1/concat 1 ," +"Mul block-0/denseblock-0-2/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-1/concat block-0/denseblock-0-2/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-2/Relu 1 1 block-0/denseblock-0-2/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-2/Relu_output 1 ," +"ReLU block-0/denseblock-0-2/Relu_activation 1 1 block-0/denseblock-0-2/Relu_output block-0/denseblock-0-2/Relu ," +"Convolution block-0/denseblock-0-2/conv2d/Conv2D 1 1 block-0/denseblock-0-2/Relu block-0/denseblock-0-2/conv2d/Conv2D 1 128 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-2/concat 2 1 block-0/denseblock-0-1/concat block-0/denseblock-0-2/conv2d/Conv2D block-0/denseblock-0-2/concat 1 ," +"Mul block-0/denseblock-0-3/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-2/concat block-0/denseblock-0-3/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-3/Relu 1 1 block-0/denseblock-0-3/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-3/Relu_output 1 ," +"ReLU block-0/denseblock-0-3/Relu_activation 1 1 block-0/denseblock-0-3/Relu_output block-0/denseblock-0-3/Relu ," +"Convolution 
block-0/denseblock-0-3/conv2d/Conv2D 1 1 block-0/denseblock-0-3/Relu block-0/denseblock-0-3/conv2d/Conv2D 1 160 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-3/concat 2 1 block-0/denseblock-0-2/concat block-0/denseblock-0-3/conv2d/Conv2D block-0/denseblock-0-3/concat 1 ," +"Mul block-0/denseblock-0-4/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-3/concat block-0/denseblock-0-4/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-4/Relu 1 1 block-0/denseblock-0-4/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-4/Relu_output 1 ," +"ReLU block-0/denseblock-0-4/Relu_activation 1 1 block-0/denseblock-0-4/Relu_output block-0/denseblock-0-4/Relu ," +"Convolution block-0/denseblock-0-4/conv2d/Conv2D 1 1 block-0/denseblock-0-4/Relu block-0/denseblock-0-4/conv2d/Conv2D 1 192 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-4/concat 2 1 block-0/denseblock-0-3/concat block-0/denseblock-0-4/conv2d/Conv2D block-0/denseblock-0-4/concat 1 ," +"Mul block-0/denseblock-0-5/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-4/concat block-0/denseblock-0-5/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/denseblock-0-5/Relu 1 1 block-0/denseblock-0-5/batch_normalization/FusedBatchNorm_mul_0 block-0/denseblock-0-5/Relu_output 1 ," +"ReLU block-0/denseblock-0-5/Relu_activation 1 1 block-0/denseblock-0-5/Relu_output block-0/denseblock-0-5/Relu ," +"Convolution block-0/denseblock-0-5/conv2d/Conv2D 1 1 block-0/denseblock-0-5/Relu block-0/denseblock-0-5/conv2d/Conv2D 1 224 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-0/denseblock-0-5/concat 2 1 block-0/denseblock-0-4/concat block-0/denseblock-0-5/conv2d/Conv2D block-0/denseblock-0-5/concat 1 ," +"Mul block-0/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/denseblock-0-5/concat block-0/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-0/Relu 1 1 block-0/batch_normalization/FusedBatchNorm_mul_0 block-0/Relu_output 1 ," +"ReLU 
block-0/Relu_activation 1 1 block-0/Relu_output block-0/Relu ," +"Convolution block-0/conv2d/Conv2D 1 1 block-0/Relu block-0/conv2d/Conv2D 1 256 128 1 1 1 1 0 0 1 0 1 1 0 ," +"Pooling block-0/average_pooling2d/AvgPool 1 1 block-0/conv2d/Conv2D block-0/average_pooling2d/AvgPool 1 2 2 2 2 0 0 -1 -1 0 1 ," +"Mul block-1/denseblock-1-0/batch_normalization/FusedBatchNorm_mul_0 1 1 block-0/average_pooling2d/AvgPool block-1/denseblock-1-0/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-0/Relu 1 1 block-1/denseblock-1-0/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-0/Relu_output 1 ," +"ReLU block-1/denseblock-1-0/Relu_activation 1 1 block-1/denseblock-1-0/Relu_output block-1/denseblock-1-0/Relu ," +"Convolution block-1/denseblock-1-0/conv2d/Conv2D 1 1 block-1/denseblock-1-0/Relu block-1/denseblock-1-0/conv2d/Conv2D 1 128 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-0/concat 2 1 block-0/average_pooling2d/AvgPool block-1/denseblock-1-0/conv2d/Conv2D block-1/denseblock-1-0/concat 1 ," +"Mul block-1/denseblock-1-1/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-0/concat block-1/denseblock-1-1/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-1/Relu 1 1 block-1/denseblock-1-1/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-1/Relu_output 1 ," +"ReLU block-1/denseblock-1-1/Relu_activation 1 1 block-1/denseblock-1-1/Relu_output block-1/denseblock-1-1/Relu ," +"Convolution block-1/denseblock-1-1/conv2d/Conv2D 1 1 block-1/denseblock-1-1/Relu block-1/denseblock-1-1/conv2d/Conv2D 1 160 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-1/concat 2 1 block-1/denseblock-1-0/concat block-1/denseblock-1-1/conv2d/Conv2D block-1/denseblock-1-1/concat 1 ," +"Mul block-1/denseblock-1-2/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-1/concat block-1/denseblock-1-2/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-2/Relu 1 1 
block-1/denseblock-1-2/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-2/Relu_output 1 ," +"ReLU block-1/denseblock-1-2/Relu_activation 1 1 block-1/denseblock-1-2/Relu_output block-1/denseblock-1-2/Relu ," +"Convolution block-1/denseblock-1-2/conv2d/Conv2D 1 1 block-1/denseblock-1-2/Relu block-1/denseblock-1-2/conv2d/Conv2D 1 192 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-2/concat 2 1 block-1/denseblock-1-1/concat block-1/denseblock-1-2/conv2d/Conv2D block-1/denseblock-1-2/concat 1 ," +"Mul block-1/denseblock-1-3/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-2/concat block-1/denseblock-1-3/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-3/Relu 1 1 block-1/denseblock-1-3/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-3/Relu_output 1 ," +"ReLU block-1/denseblock-1-3/Relu_activation 1 1 block-1/denseblock-1-3/Relu_output block-1/denseblock-1-3/Relu ," +"Convolution block-1/denseblock-1-3/conv2d/Conv2D 1 1 block-1/denseblock-1-3/Relu block-1/denseblock-1-3/conv2d/Conv2D 1 224 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-3/concat 2 1 block-1/denseblock-1-2/concat block-1/denseblock-1-3/conv2d/Conv2D block-1/denseblock-1-3/concat 1 ," +"Mul block-1/denseblock-1-4/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-3/concat block-1/denseblock-1-4/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-4/Relu 1 1 block-1/denseblock-1-4/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-4/Relu_output 1 ," +"ReLU block-1/denseblock-1-4/Relu_activation 1 1 block-1/denseblock-1-4/Relu_output block-1/denseblock-1-4/Relu ," +"Convolution block-1/denseblock-1-4/conv2d/Conv2D 1 1 block-1/denseblock-1-4/Relu block-1/denseblock-1-4/conv2d/Conv2D 1 256 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-4/concat 2 1 block-1/denseblock-1-3/concat block-1/denseblock-1-4/conv2d/Conv2D block-1/denseblock-1-4/concat 1 ," +"Mul 
block-1/denseblock-1-5/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-4/concat block-1/denseblock-1-5/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-5/Relu 1 1 block-1/denseblock-1-5/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-5/Relu_output 1 ," +"ReLU block-1/denseblock-1-5/Relu_activation 1 1 block-1/denseblock-1-5/Relu_output block-1/denseblock-1-5/Relu ," +"Convolution block-1/denseblock-1-5/conv2d/Conv2D 1 1 block-1/denseblock-1-5/Relu block-1/denseblock-1-5/conv2d/Conv2D 1 288 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-5/concat 2 1 block-1/denseblock-1-4/concat block-1/denseblock-1-5/conv2d/Conv2D block-1/denseblock-1-5/concat 1 ," +"Mul block-1/denseblock-1-6/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-5/concat block-1/denseblock-1-6/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-6/Relu 1 1 block-1/denseblock-1-6/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-6/Relu_output 1 ," +"ReLU block-1/denseblock-1-6/Relu_activation 1 1 block-1/denseblock-1-6/Relu_output block-1/denseblock-1-6/Relu ," +"Convolution block-1/denseblock-1-6/conv2d/Conv2D 1 1 block-1/denseblock-1-6/Relu block-1/denseblock-1-6/conv2d/Conv2D 1 320 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-6/concat 2 1 block-1/denseblock-1-5/concat block-1/denseblock-1-6/conv2d/Conv2D block-1/denseblock-1-6/concat 1 ," +"Mul block-1/denseblock-1-7/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-6/concat block-1/denseblock-1-7/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-7/Relu 1 1 block-1/denseblock-1-7/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-7/Relu_output 1 ," +"ReLU block-1/denseblock-1-7/Relu_activation 1 1 block-1/denseblock-1-7/Relu_output block-1/denseblock-1-7/Relu ," +"Convolution block-1/denseblock-1-7/conv2d/Conv2D 1 1 block-1/denseblock-1-7/Relu 
block-1/denseblock-1-7/conv2d/Conv2D 1 352 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-7/concat 2 1 block-1/denseblock-1-6/concat block-1/denseblock-1-7/conv2d/Conv2D block-1/denseblock-1-7/concat 1 ," +"Mul block-1/denseblock-1-8/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-7/concat block-1/denseblock-1-8/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-8/Relu 1 1 block-1/denseblock-1-8/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-8/Relu_output 1 ," +"ReLU block-1/denseblock-1-8/Relu_activation 1 1 block-1/denseblock-1-8/Relu_output block-1/denseblock-1-8/Relu ," +"Convolution block-1/denseblock-1-8/conv2d/Conv2D 1 1 block-1/denseblock-1-8/Relu block-1/denseblock-1-8/conv2d/Conv2D 1 384 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-8/concat 2 1 block-1/denseblock-1-7/concat block-1/denseblock-1-8/conv2d/Conv2D block-1/denseblock-1-8/concat 1 ," +"Mul block-1/denseblock-1-9/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-8/concat block-1/denseblock-1-9/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-9/Relu 1 1 block-1/denseblock-1-9/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-9/Relu_output 1 ," +"ReLU block-1/denseblock-1-9/Relu_activation 1 1 block-1/denseblock-1-9/Relu_output block-1/denseblock-1-9/Relu ," +"Convolution block-1/denseblock-1-9/conv2d/Conv2D 1 1 block-1/denseblock-1-9/Relu block-1/denseblock-1-9/conv2d/Conv2D 1 416 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-9/concat 2 1 block-1/denseblock-1-8/concat block-1/denseblock-1-9/conv2d/Conv2D block-1/denseblock-1-9/concat 1 ," +"Mul block-1/denseblock-1-10/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-9/concat block-1/denseblock-1-10/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-10/Relu 1 1 block-1/denseblock-1-10/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-10/Relu_output 1 ," 
+"ReLU block-1/denseblock-1-10/Relu_activation 1 1 block-1/denseblock-1-10/Relu_output block-1/denseblock-1-10/Relu ," +"Convolution block-1/denseblock-1-10/conv2d/Conv2D 1 1 block-1/denseblock-1-10/Relu block-1/denseblock-1-10/conv2d/Conv2D 1 448 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-10/concat 2 1 block-1/denseblock-1-9/concat block-1/denseblock-1-10/conv2d/Conv2D block-1/denseblock-1-10/concat 1 ," +"Mul block-1/denseblock-1-11/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-10/concat block-1/denseblock-1-11/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/denseblock-1-11/Relu 1 1 block-1/denseblock-1-11/batch_normalization/FusedBatchNorm_mul_0 block-1/denseblock-1-11/Relu_output 1 ," +"ReLU block-1/denseblock-1-11/Relu_activation 1 1 block-1/denseblock-1-11/Relu_output block-1/denseblock-1-11/Relu ," +"Convolution block-1/denseblock-1-11/conv2d/Conv2D 1 1 block-1/denseblock-1-11/Relu block-1/denseblock-1-11/conv2d/Conv2D 1 480 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-1/denseblock-1-11/concat 2 1 block-1/denseblock-1-10/concat block-1/denseblock-1-11/conv2d/Conv2D block-1/denseblock-1-11/concat 1 ," +"Mul block-1/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/denseblock-1-11/concat block-1/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-1/Relu 1 1 block-1/batch_normalization/FusedBatchNorm_mul_0 block-1/Relu_output 1 ," +"ReLU block-1/Relu_activation 1 1 block-1/Relu_output block-1/Relu ," +"Convolution block-1/conv2d/Conv2D 1 1 block-1/Relu block-1/conv2d/Conv2D 1 512 256 1 1 1 1 0 0 1 0 1 1 0 ," +"Pooling block-1/average_pooling2d/AvgPool 1 1 block-1/conv2d/Conv2D block-1/average_pooling2d/AvgPool 1 2 2 2 2 0 0 -1 -1 0 1 ," +"Mul block-2/denseblock-2-0/batch_normalization/FusedBatchNorm_mul_0 1 1 block-1/average_pooling2d/AvgPool block-2/denseblock-2-0/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-0/Relu 1 1 
block-2/denseblock-2-0/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-0/Relu_output 1 ," +"ReLU block-2/denseblock-2-0/Relu_activation 1 1 block-2/denseblock-2-0/Relu_output block-2/denseblock-2-0/Relu ," +"Convolution block-2/denseblock-2-0/conv2d/Conv2D 1 1 block-2/denseblock-2-0/Relu block-2/denseblock-2-0/conv2d/Conv2D 1 256 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-0/concat 2 1 block-1/average_pooling2d/AvgPool block-2/denseblock-2-0/conv2d/Conv2D block-2/denseblock-2-0/concat 1 ," +"Mul block-2/denseblock-2-1/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-0/concat block-2/denseblock-2-1/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-1/Relu 1 1 block-2/denseblock-2-1/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-1/Relu_output 1 ," +"ReLU block-2/denseblock-2-1/Relu_activation 1 1 block-2/denseblock-2-1/Relu_output block-2/denseblock-2-1/Relu ," +"Convolution block-2/denseblock-2-1/conv2d/Conv2D 1 1 block-2/denseblock-2-1/Relu block-2/denseblock-2-1/conv2d/Conv2D 1 288 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-1/concat 2 1 block-2/denseblock-2-0/concat block-2/denseblock-2-1/conv2d/Conv2D block-2/denseblock-2-1/concat 1 ," +"Mul block-2/denseblock-2-2/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-1/concat block-2/denseblock-2-2/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-2/Relu 1 1 block-2/denseblock-2-2/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-2/Relu_output 1 ," +"ReLU block-2/denseblock-2-2/Relu_activation 1 1 block-2/denseblock-2-2/Relu_output block-2/denseblock-2-2/Relu ," +"Convolution block-2/denseblock-2-2/conv2d/Conv2D 1 1 block-2/denseblock-2-2/Relu block-2/denseblock-2-2/conv2d/Conv2D 1 320 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-2/concat 2 1 block-2/denseblock-2-1/concat block-2/denseblock-2-2/conv2d/Conv2D block-2/denseblock-2-2/concat 1 ," +"Mul 
block-2/denseblock-2-3/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-2/concat block-2/denseblock-2-3/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-3/Relu 1 1 block-2/denseblock-2-3/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-3/Relu_output 1 ," +"ReLU block-2/denseblock-2-3/Relu_activation 1 1 block-2/denseblock-2-3/Relu_output block-2/denseblock-2-3/Relu ," +"Convolution block-2/denseblock-2-3/conv2d/Conv2D 1 1 block-2/denseblock-2-3/Relu block-2/denseblock-2-3/conv2d/Conv2D 1 352 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-3/concat 2 1 block-2/denseblock-2-2/concat block-2/denseblock-2-3/conv2d/Conv2D block-2/denseblock-2-3/concat 1 ," +"Mul block-2/denseblock-2-4/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-3/concat block-2/denseblock-2-4/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-4/Relu 1 1 block-2/denseblock-2-4/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-4/Relu_output 1 ," +"ReLU block-2/denseblock-2-4/Relu_activation 1 1 block-2/denseblock-2-4/Relu_output block-2/denseblock-2-4/Relu ," +"Convolution block-2/denseblock-2-4/conv2d/Conv2D 1 1 block-2/denseblock-2-4/Relu block-2/denseblock-2-4/conv2d/Conv2D 1 384 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-4/concat 2 1 block-2/denseblock-2-3/concat block-2/denseblock-2-4/conv2d/Conv2D block-2/denseblock-2-4/concat 1 ," +"Mul block-2/denseblock-2-5/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-4/concat block-2/denseblock-2-5/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-5/Relu 1 1 block-2/denseblock-2-5/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-5/Relu_output 1 ," +"ReLU block-2/denseblock-2-5/Relu_activation 1 1 block-2/denseblock-2-5/Relu_output block-2/denseblock-2-5/Relu ," +"Convolution block-2/denseblock-2-5/conv2d/Conv2D 1 1 block-2/denseblock-2-5/Relu 
block-2/denseblock-2-5/conv2d/Conv2D 1 416 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-5/concat 2 1 block-2/denseblock-2-4/concat block-2/denseblock-2-5/conv2d/Conv2D block-2/denseblock-2-5/concat 1 ," +"Mul block-2/denseblock-2-6/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-5/concat block-2/denseblock-2-6/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-6/Relu 1 1 block-2/denseblock-2-6/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-6/Relu_output 1 ," +"ReLU block-2/denseblock-2-6/Relu_activation 1 1 block-2/denseblock-2-6/Relu_output block-2/denseblock-2-6/Relu ," +"Convolution block-2/denseblock-2-6/conv2d/Conv2D 1 1 block-2/denseblock-2-6/Relu block-2/denseblock-2-6/conv2d/Conv2D 1 448 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-6/concat 2 1 block-2/denseblock-2-5/concat block-2/denseblock-2-6/conv2d/Conv2D block-2/denseblock-2-6/concat 1 ," +"Mul block-2/denseblock-2-7/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-6/concat block-2/denseblock-2-7/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-7/Relu 1 1 block-2/denseblock-2-7/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-7/Relu_output 1 ," +"ReLU block-2/denseblock-2-7/Relu_activation 1 1 block-2/denseblock-2-7/Relu_output block-2/denseblock-2-7/Relu ," +"Convolution block-2/denseblock-2-7/conv2d/Conv2D 1 1 block-2/denseblock-2-7/Relu block-2/denseblock-2-7/conv2d/Conv2D 1 480 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-7/concat 2 1 block-2/denseblock-2-6/concat block-2/denseblock-2-7/conv2d/Conv2D block-2/denseblock-2-7/concat 1 ," +"Mul block-2/denseblock-2-8/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-7/concat block-2/denseblock-2-8/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-8/Relu 1 1 block-2/denseblock-2-8/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-8/Relu_output 1 ," +"ReLU 
block-2/denseblock-2-8/Relu_activation 1 1 block-2/denseblock-2-8/Relu_output block-2/denseblock-2-8/Relu ," +"Convolution block-2/denseblock-2-8/conv2d/Conv2D 1 1 block-2/denseblock-2-8/Relu block-2/denseblock-2-8/conv2d/Conv2D 1 512 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-8/concat 2 1 block-2/denseblock-2-7/concat block-2/denseblock-2-8/conv2d/Conv2D block-2/denseblock-2-8/concat 1 ," +"Mul block-2/denseblock-2-9/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-8/concat block-2/denseblock-2-9/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-9/Relu 1 1 block-2/denseblock-2-9/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-9/Relu_output 1 ," +"ReLU block-2/denseblock-2-9/Relu_activation 1 1 block-2/denseblock-2-9/Relu_output block-2/denseblock-2-9/Relu ," +"Convolution block-2/denseblock-2-9/conv2d/Conv2D 1 1 block-2/denseblock-2-9/Relu block-2/denseblock-2-9/conv2d/Conv2D 1 544 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-9/concat 2 1 block-2/denseblock-2-8/concat block-2/denseblock-2-9/conv2d/Conv2D block-2/denseblock-2-9/concat 1 ," +"Mul block-2/denseblock-2-10/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-9/concat block-2/denseblock-2-10/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-10/Relu 1 1 block-2/denseblock-2-10/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-10/Relu_output 1 ," +"ReLU block-2/denseblock-2-10/Relu_activation 1 1 block-2/denseblock-2-10/Relu_output block-2/denseblock-2-10/Relu ," +"Convolution block-2/denseblock-2-10/conv2d/Conv2D 1 1 block-2/denseblock-2-10/Relu block-2/denseblock-2-10/conv2d/Conv2D 1 576 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-10/concat 2 1 block-2/denseblock-2-9/concat block-2/denseblock-2-10/conv2d/Conv2D block-2/denseblock-2-10/concat 1 ," +"Mul block-2/denseblock-2-11/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-10/concat 
block-2/denseblock-2-11/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-11/Relu 1 1 block-2/denseblock-2-11/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-11/Relu_output 1 ," +"ReLU block-2/denseblock-2-11/Relu_activation 1 1 block-2/denseblock-2-11/Relu_output block-2/denseblock-2-11/Relu ," +"Convolution block-2/denseblock-2-11/conv2d/Conv2D 1 1 block-2/denseblock-2-11/Relu block-2/denseblock-2-11/conv2d/Conv2D 1 608 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-11/concat 2 1 block-2/denseblock-2-10/concat block-2/denseblock-2-11/conv2d/Conv2D block-2/denseblock-2-11/concat 1 ," +"Mul block-2/denseblock-2-12/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-11/concat block-2/denseblock-2-12/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-12/Relu 1 1 block-2/denseblock-2-12/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-12/Relu_output 1 ," +"ReLU block-2/denseblock-2-12/Relu_activation 1 1 block-2/denseblock-2-12/Relu_output block-2/denseblock-2-12/Relu ," +"Convolution block-2/denseblock-2-12/conv2d/Conv2D 1 1 block-2/denseblock-2-12/Relu block-2/denseblock-2-12/conv2d/Conv2D 1 640 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-12/concat 2 1 block-2/denseblock-2-11/concat block-2/denseblock-2-12/conv2d/Conv2D block-2/denseblock-2-12/concat 1 ," +"Mul block-2/denseblock-2-13/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-12/concat block-2/denseblock-2-13/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-13/Relu 1 1 block-2/denseblock-2-13/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-13/Relu_output 1 ," +"ReLU block-2/denseblock-2-13/Relu_activation 1 1 block-2/denseblock-2-13/Relu_output block-2/denseblock-2-13/Relu ," +"Convolution block-2/denseblock-2-13/conv2d/Conv2D 1 1 block-2/denseblock-2-13/Relu block-2/denseblock-2-13/conv2d/Conv2D 1 672 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat 
block-2/denseblock-2-13/concat 2 1 block-2/denseblock-2-12/concat block-2/denseblock-2-13/conv2d/Conv2D block-2/denseblock-2-13/concat 1 ," +"Mul block-2/denseblock-2-14/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-13/concat block-2/denseblock-2-14/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-14/Relu 1 1 block-2/denseblock-2-14/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-14/Relu_output 1 ," +"ReLU block-2/denseblock-2-14/Relu_activation 1 1 block-2/denseblock-2-14/Relu_output block-2/denseblock-2-14/Relu ," +"Convolution block-2/denseblock-2-14/conv2d/Conv2D 1 1 block-2/denseblock-2-14/Relu block-2/denseblock-2-14/conv2d/Conv2D 1 704 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-14/concat 2 1 block-2/denseblock-2-13/concat block-2/denseblock-2-14/conv2d/Conv2D block-2/denseblock-2-14/concat 1 ," +"Mul block-2/denseblock-2-15/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-14/concat block-2/denseblock-2-15/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-15/Relu 1 1 block-2/denseblock-2-15/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-15/Relu_output 1 ," +"ReLU block-2/denseblock-2-15/Relu_activation 1 1 block-2/denseblock-2-15/Relu_output block-2/denseblock-2-15/Relu ," +"Convolution block-2/denseblock-2-15/conv2d/Conv2D 1 1 block-2/denseblock-2-15/Relu block-2/denseblock-2-15/conv2d/Conv2D 1 736 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-15/concat 2 1 block-2/denseblock-2-14/concat block-2/denseblock-2-15/conv2d/Conv2D block-2/denseblock-2-15/concat 1 ," +"Mul block-2/denseblock-2-16/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-15/concat block-2/denseblock-2-16/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-16/Relu 1 1 block-2/denseblock-2-16/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-16/Relu_output 1 ," +"ReLU block-2/denseblock-2-16/Relu_activation 
1 1 block-2/denseblock-2-16/Relu_output block-2/denseblock-2-16/Relu ," +"Convolution block-2/denseblock-2-16/conv2d/Conv2D 1 1 block-2/denseblock-2-16/Relu block-2/denseblock-2-16/conv2d/Conv2D 1 768 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-16/concat 2 1 block-2/denseblock-2-15/concat block-2/denseblock-2-16/conv2d/Conv2D block-2/denseblock-2-16/concat 1 ," +"Mul block-2/denseblock-2-17/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-16/concat block-2/denseblock-2-17/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-17/Relu 1 1 block-2/denseblock-2-17/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-17/Relu_output 1 ," +"ReLU block-2/denseblock-2-17/Relu_activation 1 1 block-2/denseblock-2-17/Relu_output block-2/denseblock-2-17/Relu ," +"Convolution block-2/denseblock-2-17/conv2d/Conv2D 1 1 block-2/denseblock-2-17/Relu block-2/denseblock-2-17/conv2d/Conv2D 1 800 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-17/concat 2 1 block-2/denseblock-2-16/concat block-2/denseblock-2-17/conv2d/Conv2D block-2/denseblock-2-17/concat 1 ," +"Mul block-2/denseblock-2-18/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-17/concat block-2/denseblock-2-18/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-18/Relu 1 1 block-2/denseblock-2-18/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-18/Relu_output 1 ," +"ReLU block-2/denseblock-2-18/Relu_activation 1 1 block-2/denseblock-2-18/Relu_output block-2/denseblock-2-18/Relu ," +"Convolution block-2/denseblock-2-18/conv2d/Conv2D 1 1 block-2/denseblock-2-18/Relu block-2/denseblock-2-18/conv2d/Conv2D 1 832 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-18/concat 2 1 block-2/denseblock-2-17/concat block-2/denseblock-2-18/conv2d/Conv2D block-2/denseblock-2-18/concat 1 ," +"Mul block-2/denseblock-2-19/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-18/concat 
block-2/denseblock-2-19/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-19/Relu 1 1 block-2/denseblock-2-19/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-19/Relu_output 1 ," +"ReLU block-2/denseblock-2-19/Relu_activation 1 1 block-2/denseblock-2-19/Relu_output block-2/denseblock-2-19/Relu ," +"Convolution block-2/denseblock-2-19/conv2d/Conv2D 1 1 block-2/denseblock-2-19/Relu block-2/denseblock-2-19/conv2d/Conv2D 1 864 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-19/concat 2 1 block-2/denseblock-2-18/concat block-2/denseblock-2-19/conv2d/Conv2D block-2/denseblock-2-19/concat 1 ," +"Mul block-2/denseblock-2-20/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-19/concat block-2/denseblock-2-20/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-20/Relu 1 1 block-2/denseblock-2-20/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-20/Relu_output 1 ," +"ReLU block-2/denseblock-2-20/Relu_activation 1 1 block-2/denseblock-2-20/Relu_output block-2/denseblock-2-20/Relu ," +"Convolution block-2/denseblock-2-20/conv2d/Conv2D 1 1 block-2/denseblock-2-20/Relu block-2/denseblock-2-20/conv2d/Conv2D 1 896 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-20/concat 2 1 block-2/denseblock-2-19/concat block-2/denseblock-2-20/conv2d/Conv2D block-2/denseblock-2-20/concat 1 ," +"Mul block-2/denseblock-2-21/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-20/concat block-2/denseblock-2-21/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-21/Relu 1 1 block-2/denseblock-2-21/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-21/Relu_output 1 ," +"ReLU block-2/denseblock-2-21/Relu_activation 1 1 block-2/denseblock-2-21/Relu_output block-2/denseblock-2-21/Relu ," +"Convolution block-2/denseblock-2-21/conv2d/Conv2D 1 1 block-2/denseblock-2-21/Relu block-2/denseblock-2-21/conv2d/Conv2D 1 928 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat 
block-2/denseblock-2-21/concat 2 1 block-2/denseblock-2-20/concat block-2/denseblock-2-21/conv2d/Conv2D block-2/denseblock-2-21/concat 1 ," +"Mul block-2/denseblock-2-22/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-21/concat block-2/denseblock-2-22/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-22/Relu 1 1 block-2/denseblock-2-22/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-22/Relu_output 1 ," +"ReLU block-2/denseblock-2-22/Relu_activation 1 1 block-2/denseblock-2-22/Relu_output block-2/denseblock-2-22/Relu ," +"Convolution block-2/denseblock-2-22/conv2d/Conv2D 1 1 block-2/denseblock-2-22/Relu block-2/denseblock-2-22/conv2d/Conv2D 1 960 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-22/concat 2 1 block-2/denseblock-2-21/concat block-2/denseblock-2-22/conv2d/Conv2D block-2/denseblock-2-22/concat 1 ," +"Mul block-2/denseblock-2-23/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-22/concat block-2/denseblock-2-23/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/denseblock-2-23/Relu 1 1 block-2/denseblock-2-23/batch_normalization/FusedBatchNorm_mul_0 block-2/denseblock-2-23/Relu_output 1 ," +"ReLU block-2/denseblock-2-23/Relu_activation 1 1 block-2/denseblock-2-23/Relu_output block-2/denseblock-2-23/Relu ," +"Convolution block-2/denseblock-2-23/conv2d/Conv2D 1 1 block-2/denseblock-2-23/Relu block-2/denseblock-2-23/conv2d/Conv2D 1 992 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-2/denseblock-2-23/concat 2 1 block-2/denseblock-2-22/concat block-2/denseblock-2-23/conv2d/Conv2D block-2/denseblock-2-23/concat 1 ," +"Mul block-2/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/denseblock-2-23/concat block-2/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-2/Relu 1 1 block-2/batch_normalization/FusedBatchNorm_mul_0 block-2/Relu_output 1 ," +"ReLU block-2/Relu_activation 1 1 block-2/Relu_output block-2/Relu ," +"Convolution block-2/conv2d/Conv2D 1 1 block-2/Relu 
block-2/conv2d/Conv2D 1 1024 512 1 1 1 1 0 0 1 0 1 1 0 ," +"Pooling block-2/average_pooling2d/AvgPool 1 1 block-2/conv2d/Conv2D block-2/average_pooling2d/AvgPool 1 2 2 2 2 0 0 -1 -1 0 1 ," +"Mul block-3/denseblock-3-0/batch_normalization/FusedBatchNorm_mul_0 1 1 block-2/average_pooling2d/AvgPool block-3/denseblock-3-0/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-0/Relu 1 1 block-3/denseblock-3-0/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-0/Relu_output 1 ," +"ReLU block-3/denseblock-3-0/Relu_activation 1 1 block-3/denseblock-3-0/Relu_output block-3/denseblock-3-0/Relu ," +"Convolution block-3/denseblock-3-0/conv2d/Conv2D 1 1 block-3/denseblock-3-0/Relu block-3/denseblock-3-0/conv2d/Conv2D 1 512 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-0/concat 2 1 block-2/average_pooling2d/AvgPool block-3/denseblock-3-0/conv2d/Conv2D block-3/denseblock-3-0/concat 1 ," +"Mul block-3/denseblock-3-1/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-0/concat block-3/denseblock-3-1/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-1/Relu 1 1 block-3/denseblock-3-1/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-1/Relu_output 1 ," +"ReLU block-3/denseblock-3-1/Relu_activation 1 1 block-3/denseblock-3-1/Relu_output block-3/denseblock-3-1/Relu ," +"Convolution block-3/denseblock-3-1/conv2d/Conv2D 1 1 block-3/denseblock-3-1/Relu block-3/denseblock-3-1/conv2d/Conv2D 1 544 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-1/concat 2 1 block-3/denseblock-3-0/concat block-3/denseblock-3-1/conv2d/Conv2D block-3/denseblock-3-1/concat 1 ," +"Mul block-3/denseblock-3-2/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-1/concat block-3/denseblock-3-2/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-2/Relu 1 1 block-3/denseblock-3-2/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-2/Relu_output 1 ," +"ReLU 
block-3/denseblock-3-2/Relu_activation 1 1 block-3/denseblock-3-2/Relu_output block-3/denseblock-3-2/Relu ," +"Convolution block-3/denseblock-3-2/conv2d/Conv2D 1 1 block-3/denseblock-3-2/Relu block-3/denseblock-3-2/conv2d/Conv2D 1 576 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-2/concat 2 1 block-3/denseblock-3-1/concat block-3/denseblock-3-2/conv2d/Conv2D block-3/denseblock-3-2/concat 1 ," +"Mul block-3/denseblock-3-3/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-2/concat block-3/denseblock-3-3/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-3/Relu 1 1 block-3/denseblock-3-3/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-3/Relu_output 1 ," +"ReLU block-3/denseblock-3-3/Relu_activation 1 1 block-3/denseblock-3-3/Relu_output block-3/denseblock-3-3/Relu ," +"Convolution block-3/denseblock-3-3/conv2d/Conv2D 1 1 block-3/denseblock-3-3/Relu block-3/denseblock-3-3/conv2d/Conv2D 1 608 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-3/concat 2 1 block-3/denseblock-3-2/concat block-3/denseblock-3-3/conv2d/Conv2D block-3/denseblock-3-3/concat 1 ," +"Mul block-3/denseblock-3-4/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-3/concat block-3/denseblock-3-4/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-4/Relu 1 1 block-3/denseblock-3-4/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-4/Relu_output 1 ," +"ReLU block-3/denseblock-3-4/Relu_activation 1 1 block-3/denseblock-3-4/Relu_output block-3/denseblock-3-4/Relu ," +"Convolution block-3/denseblock-3-4/conv2d/Conv2D 1 1 block-3/denseblock-3-4/Relu block-3/denseblock-3-4/conv2d/Conv2D 1 640 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-4/concat 2 1 block-3/denseblock-3-3/concat block-3/denseblock-3-4/conv2d/Conv2D block-3/denseblock-3-4/concat 1 ," +"Mul block-3/denseblock-3-5/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-4/concat 
block-3/denseblock-3-5/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-5/Relu 1 1 block-3/denseblock-3-5/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-5/Relu_output 1 ," +"ReLU block-3/denseblock-3-5/Relu_activation 1 1 block-3/denseblock-3-5/Relu_output block-3/denseblock-3-5/Relu ," +"Convolution block-3/denseblock-3-5/conv2d/Conv2D 1 1 block-3/denseblock-3-5/Relu block-3/denseblock-3-5/conv2d/Conv2D 1 672 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-5/concat 2 1 block-3/denseblock-3-4/concat block-3/denseblock-3-5/conv2d/Conv2D block-3/denseblock-3-5/concat 1 ," +"Mul block-3/denseblock-3-6/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-5/concat block-3/denseblock-3-6/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-6/Relu 1 1 block-3/denseblock-3-6/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-6/Relu_output 1 ," +"ReLU block-3/denseblock-3-6/Relu_activation 1 1 block-3/denseblock-3-6/Relu_output block-3/denseblock-3-6/Relu ," +"Convolution block-3/denseblock-3-6/conv2d/Conv2D 1 1 block-3/denseblock-3-6/Relu block-3/denseblock-3-6/conv2d/Conv2D 1 704 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-6/concat 2 1 block-3/denseblock-3-5/concat block-3/denseblock-3-6/conv2d/Conv2D block-3/denseblock-3-6/concat 1 ," +"Mul block-3/denseblock-3-7/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-6/concat block-3/denseblock-3-7/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-7/Relu 1 1 block-3/denseblock-3-7/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-7/Relu_output 1 ," +"ReLU block-3/denseblock-3-7/Relu_activation 1 1 block-3/denseblock-3-7/Relu_output block-3/denseblock-3-7/Relu ," +"Convolution block-3/denseblock-3-7/conv2d/Conv2D 1 1 block-3/denseblock-3-7/Relu block-3/denseblock-3-7/conv2d/Conv2D 1 736 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-7/concat 2 1 
block-3/denseblock-3-6/concat block-3/denseblock-3-7/conv2d/Conv2D block-3/denseblock-3-7/concat 1 ," +"Mul block-3/denseblock-3-8/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-7/concat block-3/denseblock-3-8/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-8/Relu 1 1 block-3/denseblock-3-8/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-8/Relu_output 1 ," +"ReLU block-3/denseblock-3-8/Relu_activation 1 1 block-3/denseblock-3-8/Relu_output block-3/denseblock-3-8/Relu ," +"Convolution block-3/denseblock-3-8/conv2d/Conv2D 1 1 block-3/denseblock-3-8/Relu block-3/denseblock-3-8/conv2d/Conv2D 1 768 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-8/concat 2 1 block-3/denseblock-3-7/concat block-3/denseblock-3-8/conv2d/Conv2D block-3/denseblock-3-8/concat 1 ," +"Mul block-3/denseblock-3-9/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-8/concat block-3/denseblock-3-9/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-9/Relu 1 1 block-3/denseblock-3-9/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-9/Relu_output 1 ," +"ReLU block-3/denseblock-3-9/Relu_activation 1 1 block-3/denseblock-3-9/Relu_output block-3/denseblock-3-9/Relu ," +"Convolution block-3/denseblock-3-9/conv2d/Conv2D 1 1 block-3/denseblock-3-9/Relu block-3/denseblock-3-9/conv2d/Conv2D 1 800 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-9/concat 2 1 block-3/denseblock-3-8/concat block-3/denseblock-3-9/conv2d/Conv2D block-3/denseblock-3-9/concat 1 ," +"Mul block-3/denseblock-3-10/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-9/concat block-3/denseblock-3-10/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-10/Relu 1 1 block-3/denseblock-3-10/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-10/Relu_output 1 ," +"ReLU block-3/denseblock-3-10/Relu_activation 1 1 block-3/denseblock-3-10/Relu_output block-3/denseblock-3-10/Relu 
," +"Convolution block-3/denseblock-3-10/conv2d/Conv2D 1 1 block-3/denseblock-3-10/Relu block-3/denseblock-3-10/conv2d/Conv2D 1 832 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-10/concat 2 1 block-3/denseblock-3-9/concat block-3/denseblock-3-10/conv2d/Conv2D block-3/denseblock-3-10/concat 1 ," +"Mul block-3/denseblock-3-11/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-10/concat block-3/denseblock-3-11/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-11/Relu 1 1 block-3/denseblock-3-11/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-11/Relu_output 1 ," +"ReLU block-3/denseblock-3-11/Relu_activation 1 1 block-3/denseblock-3-11/Relu_output block-3/denseblock-3-11/Relu ," +"Convolution block-3/denseblock-3-11/conv2d/Conv2D 1 1 block-3/denseblock-3-11/Relu block-3/denseblock-3-11/conv2d/Conv2D 1 864 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-11/concat 2 1 block-3/denseblock-3-10/concat block-3/denseblock-3-11/conv2d/Conv2D block-3/denseblock-3-11/concat 1 ," +"Mul block-3/denseblock-3-12/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-11/concat block-3/denseblock-3-12/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-12/Relu 1 1 block-3/denseblock-3-12/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-12/Relu_output 1 ," +"ReLU block-3/denseblock-3-12/Relu_activation 1 1 block-3/denseblock-3-12/Relu_output block-3/denseblock-3-12/Relu ," +"Convolution block-3/denseblock-3-12/conv2d/Conv2D 1 1 block-3/denseblock-3-12/Relu block-3/denseblock-3-12/conv2d/Conv2D 1 896 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-12/concat 2 1 block-3/denseblock-3-11/concat block-3/denseblock-3-12/conv2d/Conv2D block-3/denseblock-3-12/concat 1 ," +"Mul block-3/denseblock-3-13/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-12/concat block-3/denseblock-3-13/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add 
block-3/denseblock-3-13/Relu 1 1 block-3/denseblock-3-13/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-13/Relu_output 1 ," +"ReLU block-3/denseblock-3-13/Relu_activation 1 1 block-3/denseblock-3-13/Relu_output block-3/denseblock-3-13/Relu ," +"Convolution block-3/denseblock-3-13/conv2d/Conv2D 1 1 block-3/denseblock-3-13/Relu block-3/denseblock-3-13/conv2d/Conv2D 1 928 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-13/concat 2 1 block-3/denseblock-3-12/concat block-3/denseblock-3-13/conv2d/Conv2D block-3/denseblock-3-13/concat 1 ," +"Mul block-3/denseblock-3-14/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-13/concat block-3/denseblock-3-14/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-14/Relu 1 1 block-3/denseblock-3-14/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-14/Relu_output 1 ," +"ReLU block-3/denseblock-3-14/Relu_activation 1 1 block-3/denseblock-3-14/Relu_output block-3/denseblock-3-14/Relu ," +"Convolution block-3/denseblock-3-14/conv2d/Conv2D 1 1 block-3/denseblock-3-14/Relu block-3/denseblock-3-14/conv2d/Conv2D 1 960 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-14/concat 2 1 block-3/denseblock-3-13/concat block-3/denseblock-3-14/conv2d/Conv2D block-3/denseblock-3-14/concat 1 ," +"Mul block-3/denseblock-3-15/batch_normalization/FusedBatchNorm_mul_0 1 1 block-3/denseblock-3-14/concat block-3/denseblock-3-15/batch_normalization/FusedBatchNorm_mul_0 1 ," +"Add block-3/denseblock-3-15/Relu 1 1 block-3/denseblock-3-15/batch_normalization/FusedBatchNorm_mul_0 block-3/denseblock-3-15/Relu_output 1 ," +"ReLU block-3/denseblock-3-15/Relu_activation 1 1 block-3/denseblock-3-15/Relu_output block-3/denseblock-3-15/Relu ," +"Convolution block-3/denseblock-3-15/conv2d/Conv2D 1 1 block-3/denseblock-3-15/Relu block-3/denseblock-3-15/conv2d/Conv2D 1 992 32 3 3 1 1 0 0 1 0 1 1 0 ," +"Concat block-3/denseblock-3-15/concat 2 1 block-3/denseblock-3-14/concat 
block-3/denseblock-3-15/conv2d/Conv2D block-3/denseblock-3-15/concat 1 ," +"Pooling global_pool 1 1 block-3/denseblock-3-15/concat global_pool 1 0 0 1 1 0 0 -1 -1 -1 0 ," +"InnerProduct dense/BiasAdd 1 1 global_pool dense/BiasAdd 1001 1 1 1 ," +"Softmax softmax_tensor 1 1 dense/BiasAdd softmax_tensor 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/inception_v3.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/inception_v3.tnnproto new file mode 100644 index 0000000..d39a3fa --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/inception_v3.tnnproto @@ -0,0 +1,225 @@ +"1 0 1 4206624770 ," +"input 1 3 299 299 ," +" ," +"InceptionV3/Predictions/Reshape_1 ," +" 220 ," +"Convolution InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu 1 1 input InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu_output 1 3 32 3 3 2 2 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu_output InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu 1 1 InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu_output 1 32 32 3 3 1 1 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu_output InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu 1 1 InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu_output 1 32 64 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu_output InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/MaxPool_3a_3x3/MaxPool 1 1 InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu InceptionV3/InceptionV3/MaxPool_3a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Convolution InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu 1 1 InceptionV3/InceptionV3/MaxPool_3a_3x3/MaxPool 
InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu_output 1 64 80 1 1 1 1 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu_output InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu 1 1 InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu_output 1 80 192 3 3 1 1 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu_output InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool 1 1 InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu_output 1 192 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu_output 1 192 48 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu_output 1 48 64 5 5 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu_activation 1 1 
InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu_output 1 192 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu_output 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu_output 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu_output 1 192 32 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU 
InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_5b/concat 4 1 InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_5b/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/concat InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu_output 1 256 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/concat InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu_output 1 256 48 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu_output 1 48 64 5 5 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5b/concat 
InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu_output 1 256 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu_output 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu_output 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_5b/concat InceptionV3/InceptionV3/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu_output 1 256 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_5c/concat 4 1 
InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_5c/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/concat InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu_output 1 288 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/concat InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu_output 1 288 48 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu_output 1 48 64 5 5 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5c/concat InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu_output 1 288 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu_output 
InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu_output 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu_output 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_5c/concat InceptionV3/InceptionV3/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu_output 1 288 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_5d/concat 4 1 InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_5d/concat 1 ," 
+"Convolution InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/concat InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu_output 1 288 384 3 3 2 2 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_5d/concat InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu_output 1 288 64 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu_output 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu_output 1 96 96 3 3 2 2 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool 1 1 InceptionV3/InceptionV3/Mixed_5d/concat InceptionV3/InceptionV3/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," 
+"Concat InceptionV3/InceptionV3/Mixed_6a/concat 3 1 InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu InceptionV3/InceptionV3/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_6a/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6a/concat InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6a/concat InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu_output 1 768 128 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu_output 1 128 128 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu_output 1 128 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu_activation 1 1 
InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6a/concat InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu_output 1 768 128 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu_output 1 128 128 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu_output 1 128 128 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu_output 1 128 128 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu ," +"Convolution 
InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu_output 1 128 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_6a/concat InceptionV3/InceptionV3/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_6b/concat 4 1 InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_6b/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/concat InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu 1 1 
InceptionV3/InceptionV3/Mixed_6b/concat InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu_output 1 768 160 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu_output 1 160 160 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu_output 1 160 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6b/concat InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu_output 1 768 160 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu_output 1 160 160 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU 
InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu_output 1 160 160 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu_output 1 160 160 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu_output 1 160 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_6b/concat InceptionV3/InceptionV3/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool 
InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_6c/concat 4 1 InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_6c/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/concat InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/concat InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu_output 1 768 160 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu_output 1 160 160 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu ," 
+"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu_output 1 160 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6c/concat InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu_output 1 768 160 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu_output 1 160 160 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu_output 1 160 160 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu 
InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu_output 1 160 160 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu_output 1 160 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_6c/concat InceptionV3/InceptionV3/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_6d/concat 4 1 InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_6d/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/concat InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 
1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/concat InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu_output 1 192 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu_output 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6d/concat InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu_output 
InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu_output 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu_output 1 192 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu_output 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu_output 1 192 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 
InceptionV3/InceptionV3/Mixed_6d/concat InceptionV3/InceptionV3/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_6e/concat 4 1 InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_6e/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/concat InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu_output 1 192 320 3 3 2 2 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_6e/concat 
InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu_output 1 768 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu_output 1 192 192 1 7 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu_output 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu_output 1 192 192 3 3 2 2 0 0 1 -1 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu_output InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu ," +"Pooling InceptionV3/InceptionV3/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool 1 1 InceptionV3/InceptionV3/Mixed_6e/concat InceptionV3/InceptionV3/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Concat InceptionV3/InceptionV3/Mixed_7a/concat 3 1 
InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu InceptionV3/InceptionV3/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool InceptionV3/InceptionV3/Mixed_7a/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/concat InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu_output 1 1280 320 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/concat InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu_output 1 1280 384 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu_output 1 384 384 1 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu_output 1 384 384 3 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu_output 
InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7b/Branch_1/concat 2 1 InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_1/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7a/concat InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu_output 1 1280 448 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu_output 1 448 384 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu_output 1 384 384 1 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu_output 1 384 384 3 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu_activation 1 1 
InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7b/Branch_2/concat 2 1 InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_2/concat 1 ," +"Pooling InceptionV3/InceptionV3/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_7a/concat InceptionV3/InceptionV3/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu_output 1 1280 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7b/concat 4 1 InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7b/Branch_1/concat InceptionV3/InceptionV3/Mixed_7b/Branch_2/concat InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_7b/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/concat InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu_output 1 2048 320 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/concat InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu_output 1 2048 384 1 
1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu_output 1 384 384 1 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu_output 1 384 384 3 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7c/Branch_1/concat 2 1 InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_1/concat 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7b/concat InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu_output 1 2048 448 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu 
InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu_output 1 448 384 3 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu_output 1 384 384 1 3 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu_output 1 384 384 3 1 1 1 0 0 1 0 1 1 1 ," +"ReLU InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7c/Branch_2/concat 2 1 InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_2/concat 1 ," +"Pooling InceptionV3/InceptionV3/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_7b/concat InceptionV3/InceptionV3/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu_output 1 2048 192 1 1 1 1 0 0 1 0 1 1 1 ," +"ReLU 
InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu_activation 1 1 InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu_output InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu ," +"Concat InceptionV3/InceptionV3/Mixed_7c/concat 4 1 InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu InceptionV3/InceptionV3/Mixed_7c/Branch_1/concat InceptionV3/InceptionV3/Mixed_7c/Branch_2/concat InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu InceptionV3/InceptionV3/Mixed_7c/concat 1 ," +"Pooling InceptionV3/Logits/AvgPool_1a_8x8/AvgPool 1 1 InceptionV3/InceptionV3/Mixed_7c/concat InceptionV3/Logits/AvgPool_1a_8x8/AvgPool 1 8 8 2 2 0 0 -1 -1 1 1 ," +"Convolution InceptionV3/Logits/Conv2d_1c_1x1/BiasAdd 1 1 InceptionV3/Logits/AvgPool_1a_8x8/AvgPool InceptionV3/Logits/Conv2d_1c_1x1/BiasAdd 1 2048 1001 1 1 1 1 0 0 1 0 1 1 0 ," +"Reshape InceptionV3/Logits/SpatialSqueeze 1 1 InceptionV3/Logits/Conv2d_1c_1x1/BiasAdd InceptionV3/Logits/SpatialSqueeze 0 4 4 0 1001 1 1 0 ," +"Softmax InceptionV3/Predictions/Reshape_1 1 1 InceptionV3/Logits/SpatialSqueeze InceptionV3/Predictions/Reshape_1 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/inception_v4.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/inception_v4.tnnproto new file mode 100644 index 0000000..4b25699 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/inception_v4.tnnproto @@ -0,0 +1,200 @@ +"1 0 1 4206624770 ," +"input 1 3 299 299 ," +" ," +"InceptionV4/Logits/Predictions ," +" 195 ," +"Convolution InceptionV4/InceptionV4/Conv2d_1a_3x3/Relu 1 1 input InceptionV4/InceptionV4/Conv2d_1a_3x3/Relu 1 3 32 3 3 2 2 0 0 1 -1 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Conv2d_2a_3x3/Relu 1 1 InceptionV4/InceptionV4/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Conv2d_2a_3x3/Relu 1 32 32 3 3 1 1 0 0 1 -1 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu 1 1 InceptionV4/InceptionV4/Conv2d_2a_3x3/Relu InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu 1 32 64 3 3 1 1 
0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool 1 1 InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu InceptionV4/InceptionV4/Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu 1 1 InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu 1 64 96 3 3 2 2 0 0 1 -1 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_3a/concat 2 1 InceptionV4/InceptionV4/Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu InceptionV4/InceptionV4/Mixed_3a/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_3a/concat InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu 1 160 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu 1 64 96 3 3 1 1 0 0 1 -1 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_3a/concat InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu 1 160 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu 1 64 64 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu 1 64 64 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu 
InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu 1 64 96 3 3 1 1 0 0 1 -1 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_4a/concat 2 1 InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_4a/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_4a/concat InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu 1 192 192 3 3 2 2 0 0 1 -1 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool 1 1 InceptionV4/InceptionV4/Mixed_4a/concat InceptionV4/InceptionV4/Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_5a/concat 2 1 InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool InceptionV4/InceptionV4/Mixed_5a/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5a/concat InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5a/concat InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5a/concat InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu 
InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_5a/concat InceptionV4/InceptionV4/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_5b/concat 4 1 InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_5b/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/concat InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/concat InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5b/concat InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," 
+"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_5b/concat InceptionV4/InceptionV4/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_5c/concat 4 1 InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_5c/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/concat InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5c/concat InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu 1 
1 InceptionV4/InceptionV4/Mixed_5c/concat InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_5c/concat InceptionV4/InceptionV4/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_5d/concat 4 1 InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/Relu InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_5d/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/concat InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/concat InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/Relu 
InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5d/concat InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/Relu 1 384 64 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/Relu 1 64 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/Relu 1 96 96 3 3 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_5e/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_5d/concat InceptionV4/InceptionV4/Mixed_5e/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/Relu 1 384 96 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_5e/concat 4 1 InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/Relu InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_5e/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/concat InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu 1 384 384 3 3 2 2 0 0 1 -1 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_5e/concat InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu 1 384 192 1 1 1 1 0 0 1 0 1 1 1 ," 
+"Convolution InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu 1 192 224 3 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu 1 224 256 3 3 2 2 0 0 1 -1 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool 1 1 InceptionV4/InceptionV4/Mixed_5e/concat InceptionV4/InceptionV4/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6a/concat 3 1 InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool InceptionV4/InceptionV4/Mixed_6a/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6a/concat InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6a/concat InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6a/concat 
InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6a/concat InceptionV4/InceptionV4/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6b/concat 4 1 InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6b/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/concat 
InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/concat InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6b/concat InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling 
InceptionV4/InceptionV4/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6b/concat InceptionV4/InceptionV4/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6c/concat 4 1 InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6c/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/concat InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/concat InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6c/concat InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu 1 1 
InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6c/concat InceptionV4/InceptionV4/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6d/concat 4 1 InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6d/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/concat InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/concat 
InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6d/concat InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6d/concat InceptionV4/InceptionV4/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution 
InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6e/concat 4 1 InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6e/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/concat InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/concat InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6e/concat InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/Relu 
1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6f/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6e/concat InceptionV4/InceptionV4/Mixed_6f/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6f/concat 4 1 InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6f/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/concat InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/concat InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/Relu 
InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6f/concat InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6g/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6f/concat InceptionV4/InceptionV4/Mixed_6g/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6g/concat 4 1 
InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6g/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/concat InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/Relu 1 1024 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/concat InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/Relu 1 224 256 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6g/concat InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/Relu 1 192 192 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/Relu 1 192 224 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/Relu 1 1 
InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/Relu 1 224 224 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/Relu 1 224 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_6h/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_6g/concat InceptionV4/InceptionV4/Mixed_6h/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/Relu 1 1024 128 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_6h/concat 4 1 InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/Relu InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_6h/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/concat InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu 1 1024 192 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu 1 192 192 3 3 2 2 0 0 1 -1 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_6h/concat InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu 1 1024 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu 
InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu 1 256 256 1 7 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu 1 256 320 7 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu 1 320 320 3 3 2 2 0 0 1 -1 1 1 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool 1 1 InceptionV4/InceptionV4/Mixed_6h/concat InceptionV4/InceptionV4/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool 0 3 3 2 2 0 0 -1 -1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7a/concat 3 1 InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu InceptionV4/InceptionV4/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool InceptionV4/InceptionV4/Mixed_7a/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/concat InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/concat InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu 1 384 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/Relu 1 384 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat 
InceptionV4/InceptionV4/Mixed_7b/Branch_1/concat 2 1 InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_1/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7a/concat InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/Relu 1 384 448 3 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu 1 448 512 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/Relu 1 512 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/Relu 1 512 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7b/Branch_2/concat 2 1 InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_2/concat 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_7a/concat InceptionV4/InceptionV4/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool 
InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7b/concat 4 1 InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7b/Branch_1/concat InceptionV4/InceptionV4/Mixed_7b/Branch_2/concat InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_7b/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/concat InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/concat InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu 1 384 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu 1 384 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7c/Branch_1/concat 2 1 InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_1/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7b/concat InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/Relu 1 384 448 3 1 1 1 0 0 1 0 
1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu 1 448 512 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/Relu 1 512 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/Relu 1 512 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7c/Branch_2/concat 2 1 InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_2/concat 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_7b/concat InceptionV4/InceptionV4/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7c/concat 4 1 InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7c/Branch_1/concat InceptionV4/InceptionV4/Mixed_7c/Branch_2/concat InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_7c/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/concat InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution 
InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/concat InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/Relu 1 384 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/Relu 1 384 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7d/Branch_1/concat 2 1 InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_1/concat 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7c/concat InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/Relu 1 1536 384 1 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/Relu 1 384 448 3 1 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu 1 448 512 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/Relu 1 512 256 1 3 1 1 0 0 1 0 1 1 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu 
InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/Relu 1 512 256 3 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7d/Branch_2/concat 2 1 InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_2/concat 1 ," +"Pooling InceptionV4/InceptionV4/Mixed_7d/Branch_3/AvgPool_0a_3x3/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_7c/concat InceptionV4/InceptionV4/Mixed_7d/Branch_3/AvgPool_0a_3x3/AvgPool 1 3 3 1 1 0 0 -1 -1 0 1 ," +"Convolution InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/Relu 1 1 InceptionV4/InceptionV4/Mixed_7d/Branch_3/AvgPool_0a_3x3/AvgPool InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/Relu 1 1536 256 1 1 1 1 0 0 1 0 1 1 1 ," +"Concat InceptionV4/InceptionV4/Mixed_7d/concat 4 1 InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/Relu InceptionV4/InceptionV4/Mixed_7d/Branch_1/concat InceptionV4/InceptionV4/Mixed_7d/Branch_2/concat InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/Relu InceptionV4/InceptionV4/Mixed_7d/concat 1 ," +"Pooling InceptionV4/Logits/AvgPool_1a/AvgPool 1 1 InceptionV4/InceptionV4/Mixed_7d/concat InceptionV4/Logits/AvgPool_1a/AvgPool 1 8 8 1 1 0 0 -1 -1 1 1 ," +"InnerProduct InceptionV4/Logits/Logits/BiasAdd 1 1 InceptionV4/Logits/AvgPool_1a/AvgPool InceptionV4/Logits/Logits/BiasAdd 1001 1 1 1 ," +"Softmax InceptionV4/Logits/Predictions 1 1 InceptionV4/Logits/Logits/BiasAdd InceptionV4/Logits/Predictions 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v1.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v1.tnnproto new file mode 100644 index 0000000..98ed73c --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v1.tnnproto @@ -0,0 +1,62 @@ +"1 58 1 4206624770 ," +"input 1 3 224 224 ," +" conv1 conv2_1/dw conv2_1/sep conv2_2/dw conv2_2/sep conv3_1/dw conv3_1/sep conv3_2/dw conv3_2/sep conv4_1/dw conv4_1/sep conv4_2/dw conv4_2/sep conv5_1/dw 
conv5_1/sep conv5_2/dw conv5_2/sep conv5_3/dw conv5_3/sep conv5_4/dw conv5_4/sep conv5_5/dw conv5_5/sep conv5_6/dw conv5_6/sep conv6/dw conv6/sep fc7 input pool6 prob relu1 relu2_1/dw relu2_1/sep relu2_2/dw relu2_2/sep relu3_1/dw relu3_1/sep relu3_2/dw relu3_2/sep relu4_1/dw relu4_1/sep relu4_2/dw relu4_2/sep relu5_1/dw relu5_1/sep relu5_2/dw relu5_2/sep relu5_3/dw relu5_3/sep relu5_4/dw relu5_4/sep relu5_5/dw relu5_5/sep relu5_6/dw relu5_6/sep relu6/dw relu6/sep ," +"prob ," +" 57 ," +"Convolution conv1 1 1 input conv1 1 3 32 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu1 1 1 conv1 relu1 ," +"Convolution conv2_1/dw 1 1 relu1 conv2_1/dw 32 1 32 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu2_1/dw 1 1 conv2_1/dw relu2_1/dw ," +"Convolution conv2_1/sep 1 1 relu2_1/dw conv2_1/sep 1 32 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu2_1/sep 1 1 conv2_1/sep relu2_1/sep ," +"Convolution conv2_2/dw 1 1 relu2_1/sep conv2_2/dw 64 1 64 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu2_2/dw 1 1 conv2_2/dw relu2_2/dw ," +"Convolution conv2_2/sep 1 1 relu2_2/dw conv2_2/sep 1 64 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu2_2/sep 1 1 conv2_2/sep relu2_2/sep ," +"Convolution conv3_1/dw 1 1 relu2_2/sep conv3_1/dw 128 1 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu3_1/dw 1 1 conv3_1/dw relu3_1/dw ," +"Convolution conv3_1/sep 1 1 relu3_1/dw conv3_1/sep 1 128 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu3_1/sep 1 1 conv3_1/sep relu3_1/sep ," +"Convolution conv3_2/dw 1 1 relu3_1/sep conv3_2/dw 128 1 128 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu3_2/dw 1 1 conv3_2/dw relu3_2/dw ," +"Convolution conv3_2/sep 1 1 relu3_2/dw conv3_2/sep 1 128 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu3_2/sep 1 1 conv3_2/sep relu3_2/sep ," +"Convolution conv4_1/dw 1 1 relu3_2/sep conv4_1/dw 256 1 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_1/dw 1 1 conv4_1/dw relu4_1/dw ," +"Convolution conv4_1/sep 1 1 relu4_1/dw conv4_1/sep 1 256 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_1/sep 1 1 conv4_1/sep relu4_1/sep ," +"Convolution conv4_2/dw 1 1 relu4_1/sep conv4_2/dw 256 1 256 3 3 2 2 1 1 
1 -1 1 1 ," +"ReLU relu4_2/dw 1 1 conv4_2/dw relu4_2/dw ," +"Convolution conv4_2/sep 1 1 relu4_2/dw conv4_2/sep 1 256 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_2/sep 1 1 conv4_2/sep relu4_2/sep ," +"Convolution conv5_1/dw 1 1 relu4_2/sep conv5_1/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_1/dw 1 1 conv5_1/dw relu5_1/dw ," +"Convolution conv5_1/sep 1 1 relu5_1/dw conv5_1/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_1/sep 1 1 conv5_1/sep relu5_1/sep ," +"Convolution conv5_2/dw 1 1 relu5_1/sep conv5_2/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_2/dw 1 1 conv5_2/dw relu5_2/dw ," +"Convolution conv5_2/sep 1 1 relu5_2/dw conv5_2/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_2/sep 1 1 conv5_2/sep relu5_2/sep ," +"Convolution conv5_3/dw 1 1 relu5_2/sep conv5_3/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_3/dw 1 1 conv5_3/dw relu5_3/dw ," +"Convolution conv5_3/sep 1 1 relu5_3/dw conv5_3/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_3/sep 1 1 conv5_3/sep relu5_3/sep ," +"Convolution conv5_4/dw 1 1 relu5_3/sep conv5_4/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_4/dw 1 1 conv5_4/dw relu5_4/dw ," +"Convolution conv5_4/sep 1 1 relu5_4/dw conv5_4/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_4/sep 1 1 conv5_4/sep relu5_4/sep ," +"Convolution conv5_5/dw 1 1 relu5_4/sep conv5_5/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_5/dw 1 1 conv5_5/dw relu5_5/dw ," +"Convolution conv5_5/sep 1 1 relu5_5/dw conv5_5/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_5/sep 1 1 conv5_5/sep relu5_5/sep ," +"Convolution conv5_6/dw 1 1 relu5_5/sep conv5_6/dw 512 1 512 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu5_6/dw 1 1 conv5_6/dw relu5_6/dw ," +"Convolution conv5_6/sep 1 1 relu5_6/dw conv5_6/sep 1 512 1024 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_6/sep 1 1 conv5_6/sep relu5_6/sep ," +"Convolution conv6/dw 1 1 relu5_6/sep conv6/dw 1024 1 1024 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu6/dw 1 1 conv6/dw relu6/dw ," +"Convolution conv6/sep 1 1 relu6/dw conv6/sep 1 1024 
1024 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu6/sep 1 1 conv6/sep relu6/sep ," +"Pooling pool6 1 1 relu6/sep pool6 1 0 0 1 1 0 0 -1 -1 -1 0 ," +"Convolution fc7 1 1 pool6 fc7 1 1024 1000 1 1 1 1 0 0 1 -1 1 1 ," +"SoftmaxCaffe prob 1 1 fc7 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v2.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v2.tnnproto new file mode 100644 index 0000000..8dd6c7d --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/mobilenet_v2.tnnproto @@ -0,0 +1,107 @@ +"1 103 1 4206624770 ," +"input 1 3 224 224 ," +" block_3_1 block_4_1 block_4_2 block_4_4 block_4_5 block_4_6 block_5_1 block_5_2 block_6_1 block_6_2 conv1/bn conv2_1/dwise/bn conv2_1/expand/bn conv2_1/linear/bn conv2_2/dwise/bn conv2_2/expand/bn conv2_2/linear/bn conv3_1/dwise/bn conv3_1/expand/bn conv3_1/linear/bn conv3_2/dwise/bn conv3_2/expand/bn conv3_2/linear/bn conv4_1/dwise/bn conv4_1/expand/bn conv4_1/linear/bn conv4_2/dwise/bn conv4_2/expand/bn conv4_2/linear/bn conv4_3/dwise/bn conv4_3/expand/bn conv4_3/linear/bn conv4_4/dwise/bn conv4_4/expand/bn conv4_4/linear/bn conv4_5/dwise/bn conv4_5/expand/bn conv4_5/linear/bn conv4_6/dwise/bn conv4_6/expand/bn conv4_6/linear/bn conv4_7/dwise/bn conv4_7/expand/bn conv4_7/linear/bn conv5_1/dwise/bn conv5_1/expand/bn conv5_1/linear/bn conv5_2/dwise/bn conv5_2/expand/bn conv5_2/linear/bn conv5_3/dwise/bn conv5_3/expand/bn conv5_3/linear/bn conv6_1/dwise/bn conv6_1/expand/bn conv6_1/linear/bn conv6_2/dwise/bn conv6_2/expand/bn conv6_2/linear/bn conv6_3/dwise/bn conv6_3/expand/bn conv6_3/linear/bn conv6_4/bn fc7 input pool6 prob relu1 relu2_1/dwise relu2_1/expand relu2_2/dwise relu2_2/expand relu3_1/dwise relu3_1/expand relu3_2/dwise relu3_2/expand relu4_1/dwise relu4_1/expand relu4_2/dwise relu4_2/expand relu4_3/dwise relu4_3/expand relu4_4/dwise relu4_4/expand relu4_5/dwise relu4_5/expand relu4_6/dwise relu4_6/expand relu4_7/dwise relu4_7/expand relu5_1/dwise relu5_1/expand relu5_2/dwise relu5_2/expand 
relu5_3/dwise relu5_3/expand relu6_1/dwise relu6_1/expand relu6_2/dwise relu6_2/expand relu6_3/dwise relu6_3/expand relu6_4 ," +"prob ," +" 102 ," +"Convolution conv1 1 1 input conv1/bn 1 3 32 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu1 1 1 conv1/bn relu1 ," +"Convolution conv2_1/expand 1 1 relu1 conv2_1/expand/bn 1 32 32 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu2_1/expand 1 1 conv2_1/expand/bn relu2_1/expand ," +"Convolution conv2_1/dwise 1 1 relu2_1/expand conv2_1/dwise/bn 32 1 32 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu2_1/dwise 1 1 conv2_1/dwise/bn relu2_1/dwise ," +"Convolution conv2_1/linear 1 1 relu2_1/dwise conv2_1/linear/bn 1 32 16 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv2_2/expand 1 1 conv2_1/linear/bn conv2_2/expand/bn 1 16 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu2_2/expand 1 1 conv2_2/expand/bn relu2_2/expand ," +"Convolution conv2_2/dwise 1 1 relu2_2/expand conv2_2/dwise/bn 96 1 96 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu2_2/dwise 1 1 conv2_2/dwise/bn relu2_2/dwise ," +"Convolution conv2_2/linear 1 1 relu2_2/dwise conv2_2/linear/bn 1 96 24 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv3_1/expand 1 1 conv2_2/linear/bn conv3_1/expand/bn 1 24 144 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu3_1/expand 1 1 conv3_1/expand/bn relu3_1/expand ," +"Convolution conv3_1/dwise 1 1 relu3_1/expand conv3_1/dwise/bn 144 1 144 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu3_1/dwise 1 1 conv3_1/dwise/bn relu3_1/dwise ," +"Convolution conv3_1/linear 1 1 relu3_1/dwise conv3_1/linear/bn 1 144 24 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_3_1 2 1 conv2_2/linear/bn conv3_1/linear/bn block_3_1 ," +"Convolution conv3_2/expand 1 1 block_3_1 conv3_2/expand/bn 1 24 144 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu3_2/expand 1 1 conv3_2/expand/bn relu3_2/expand ," +"Convolution conv3_2/dwise 1 1 relu3_2/expand conv3_2/dwise/bn 144 1 144 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu3_2/dwise 1 1 conv3_2/dwise/bn relu3_2/dwise ," +"Convolution conv3_2/linear 1 1 relu3_2/dwise conv3_2/linear/bn 1 144 32 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv4_1/expand 1 
1 conv3_2/linear/bn conv4_1/expand/bn 1 32 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_1/expand 1 1 conv4_1/expand/bn relu4_1/expand ," +"Convolution conv4_1/dwise 1 1 relu4_1/expand conv4_1/dwise/bn 192 1 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_1/dwise 1 1 conv4_1/dwise/bn relu4_1/dwise ," +"Convolution conv4_1/linear 1 1 relu4_1/dwise conv4_1/linear/bn 1 192 32 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_4_1 2 1 conv3_2/linear/bn conv4_1/linear/bn block_4_1 ," +"Convolution conv4_2/expand 1 1 block_4_1 conv4_2/expand/bn 1 32 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_2/expand 1 1 conv4_2/expand/bn relu4_2/expand ," +"Convolution conv4_2/dwise 1 1 relu4_2/expand conv4_2/dwise/bn 192 1 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_2/dwise 1 1 conv4_2/dwise/bn relu4_2/dwise ," +"Convolution conv4_2/linear 1 1 relu4_2/dwise conv4_2/linear/bn 1 192 32 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_4_2 2 1 block_4_1 conv4_2/linear/bn block_4_2 ," +"Convolution conv4_3/expand 1 1 block_4_2 conv4_3/expand/bn 1 32 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_3/expand 1 1 conv4_3/expand/bn relu4_3/expand ," +"Convolution conv4_3/dwise 1 1 relu4_3/expand conv4_3/dwise/bn 192 1 192 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu4_3/dwise 1 1 conv4_3/dwise/bn relu4_3/dwise ," +"Convolution conv4_3/linear 1 1 relu4_3/dwise conv4_3/linear/bn 1 192 64 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv4_4/expand 1 1 conv4_3/linear/bn conv4_4/expand/bn 1 64 384 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_4/expand 1 1 conv4_4/expand/bn relu4_4/expand ," +"Convolution conv4_4/dwise 1 1 relu4_4/expand conv4_4/dwise/bn 384 1 384 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_4/dwise 1 1 conv4_4/dwise/bn relu4_4/dwise ," +"Convolution conv4_4/linear 1 1 relu4_4/dwise conv4_4/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_4_4 2 1 conv4_3/linear/bn conv4_4/linear/bn block_4_4 ," +"Convolution conv4_5/expand 1 1 block_4_4 conv4_5/expand/bn 1 64 384 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_5/expand 1 1 conv4_5/expand/bn relu4_5/expand ," +"Convolution 
conv4_5/dwise 1 1 relu4_5/expand conv4_5/dwise/bn 384 1 384 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_5/dwise 1 1 conv4_5/dwise/bn relu4_5/dwise ," +"Convolution conv4_5/linear 1 1 relu4_5/dwise conv4_5/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_4_5 2 1 block_4_4 conv4_5/linear/bn block_4_5 ," +"Convolution conv4_6/expand 1 1 block_4_5 conv4_6/expand/bn 1 64 384 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_6/expand 1 1 conv4_6/expand/bn relu4_6/expand ," +"Convolution conv4_6/dwise 1 1 relu4_6/expand conv4_6/dwise/bn 384 1 384 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_6/dwise 1 1 conv4_6/dwise/bn relu4_6/dwise ," +"Convolution conv4_6/linear 1 1 relu4_6/dwise conv4_6/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_4_6 2 1 block_4_5 conv4_6/linear/bn block_4_6 ," +"Convolution conv4_7/expand 1 1 block_4_6 conv4_7/expand/bn 1 64 384 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu4_7/expand 1 1 conv4_7/expand/bn relu4_7/expand ," +"Convolution conv4_7/dwise 1 1 relu4_7/expand conv4_7/dwise/bn 384 1 384 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu4_7/dwise 1 1 conv4_7/dwise/bn relu4_7/dwise ," +"Convolution conv4_7/linear 1 1 relu4_7/dwise conv4_7/linear/bn 1 384 96 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv5_1/expand 1 1 conv4_7/linear/bn conv5_1/expand/bn 1 96 576 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_1/expand 1 1 conv5_1/expand/bn relu5_1/expand ," +"Convolution conv5_1/dwise 1 1 relu5_1/expand conv5_1/dwise/bn 576 1 576 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_1/dwise 1 1 conv5_1/dwise/bn relu5_1/dwise ," +"Convolution conv5_1/linear 1 1 relu5_1/dwise conv5_1/linear/bn 1 576 96 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_5_1 2 1 conv4_7/linear/bn conv5_1/linear/bn block_5_1 ," +"Convolution conv5_2/expand 1 1 block_5_1 conv5_2/expand/bn 1 96 576 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_2/expand 1 1 conv5_2/expand/bn relu5_2/expand ," +"Convolution conv5_2/dwise 1 1 relu5_2/expand conv5_2/dwise/bn 576 1 576 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu5_2/dwise 1 1 conv5_2/dwise/bn relu5_2/dwise ," +"Convolution 
conv5_2/linear 1 1 relu5_2/dwise conv5_2/linear/bn 1 576 96 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_5_2 2 1 block_5_1 conv5_2/linear/bn block_5_2 ," +"Convolution conv5_3/expand 1 1 block_5_2 conv5_3/expand/bn 1 96 576 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu5_3/expand 1 1 conv5_3/expand/bn relu5_3/expand ," +"Convolution conv5_3/dwise 1 1 relu5_3/expand conv5_3/dwise/bn 576 1 576 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU relu5_3/dwise 1 1 conv5_3/dwise/bn relu5_3/dwise ," +"Convolution conv5_3/linear 1 1 relu5_3/dwise conv5_3/linear/bn 1 576 160 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv6_1/expand 1 1 conv5_3/linear/bn conv6_1/expand/bn 1 160 960 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu6_1/expand 1 1 conv6_1/expand/bn relu6_1/expand ," +"Convolution conv6_1/dwise 1 1 relu6_1/expand conv6_1/dwise/bn 960 1 960 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu6_1/dwise 1 1 conv6_1/dwise/bn relu6_1/dwise ," +"Convolution conv6_1/linear 1 1 relu6_1/dwise conv6_1/linear/bn 1 960 160 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_6_1 2 1 conv5_3/linear/bn conv6_1/linear/bn block_6_1 ," +"Convolution conv6_2/expand 1 1 block_6_1 conv6_2/expand/bn 1 160 960 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu6_2/expand 1 1 conv6_2/expand/bn relu6_2/expand ," +"Convolution conv6_2/dwise 1 1 relu6_2/expand conv6_2/dwise/bn 960 1 960 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu6_2/dwise 1 1 conv6_2/dwise/bn relu6_2/dwise ," +"Convolution conv6_2/linear 1 1 relu6_2/dwise conv6_2/linear/bn 1 960 160 1 1 1 1 0 0 1 -1 1 1 ," +"Add block_6_2 2 1 block_6_1 conv6_2/linear/bn block_6_2 ," +"Convolution conv6_3/expand 1 1 block_6_2 conv6_3/expand/bn 1 160 960 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu6_3/expand 1 1 conv6_3/expand/bn relu6_3/expand ," +"Convolution conv6_3/dwise 1 1 relu6_3/expand conv6_3/dwise/bn 960 1 960 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU relu6_3/dwise 1 1 conv6_3/dwise/bn relu6_3/dwise ," +"Convolution conv6_3/linear 1 1 relu6_3/dwise conv6_3/linear/bn 1 960 320 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution conv6_4 1 1 conv6_3/linear/bn conv6_4/bn 1 320 1280 
1 1 1 1 0 0 1 -1 1 1 ," +"ReLU relu6_4 1 1 conv6_4/bn relu6_4 ," +"Pooling pool6 1 1 relu6_4 pool6 1 0 0 1 1 0 0 -1 -1 -1 0 ," +"Convolution fc7 1 1 pool6 fc7 1 1280 1000 1 1 1 1 0 0 1 -1 1 1 ," +"SoftmaxCaffe prob 1 1 fc7 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_inception_v3.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/quant_inception_v3.tnnproto new file mode 100644 index 0000000..d89f983 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_inception_v3.tnnproto @@ -0,0 +1,127 @@ +"1 217 1 4206624770 ," +"input 1 3 395 395 ," +" classifier classifier_Reshape conv1_3x3_relu conv1_3x3_s2 conv2_3x3_relu conv2_3x3_s1 conv3_3x3_relu conv3_3x3_s1 conv4_3x3 conv4_3x3_reduce conv4_relu_3x3 conv4_relu_3x3_reduce inception_a1_1x1 inception_a1_1x1_relu inception_a1_3x3_1 inception_a1_3x3_1_relu inception_a1_3x3_2 inception_a1_3x3_2_relu inception_a1_3x3_reduce inception_a1_3x3_reduce_relu inception_a1_5x5 inception_a1_5x5_reduce inception_a1_5x5_reduce_relu inception_a1_5x5_relu inception_a1_output inception_a1_pool inception_a1_pool_proj inception_a1_pool_proj_relu inception_a2_1x1 inception_a2_1x1_relu inception_a2_3x3_1 inception_a2_3x3_1_relu inception_a2_3x3_2 inception_a2_3x3_2_relu inception_a2_3x3_reduce inception_a2_3x3_reduce_relu inception_a2_5x5 inception_a2_5x5_reduce inception_a2_5x5_reduce_relu inception_a2_5x5_relu inception_a2_output inception_a2_pool inception_a2_pool_proj inception_a2_pool_proj_relu inception_a3_1x1 inception_a3_1x1_relu inception_a3_3x3_1 inception_a3_3x3_1_relu inception_a3_3x3_2 inception_a3_3x3_2_relu inception_a3_3x3_reduce inception_a3_3x3_reduce_relu inception_a3_5x5 inception_a3_5x5_reduce inception_a3_5x5_reduce_relu inception_a3_5x5_relu inception_a3_output inception_a3_pool inception_a3_pool_proj inception_a3_pool_proj_relu inception_b1_1x1 inception_b1_1x1_2 inception_b1_1x1_2_relu inception_b1_1x1_relu inception_b1_1x7 inception_b1_1x7_2 inception_b1_1x7_2_relu inception_b1_1x7_3 
inception_b1_1x7_3_relu inception_b1_1x7_reduce inception_b1_1x7_reduce_relu inception_b1_1x7_relu inception_b1_7x1 inception_b1_7x1_2 inception_b1_7x1_2_relu inception_b1_7x1_3 inception_b1_7x1_3_relu inception_b1_7x1_reduce inception_b1_7x1_reduce_relu inception_b1_7x1_relu inception_b1_concat inception_b1_pool_ave inception_b2_1x1 inception_b2_1x1_2 inception_b2_1x1_2_relu inception_b2_1x1_relu inception_b2_1x7 inception_b2_1x7_2 inception_b2_1x7_2_relu inception_b2_1x7_3 inception_b2_1x7_3_relu inception_b2_1x7_reduce inception_b2_1x7_reduce_relu inception_b2_1x7_relu inception_b2_7x1 inception_b2_7x1_2 inception_b2_7x1_2_relu inception_b2_7x1_3 inception_b2_7x1_3_relu inception_b2_7x1_reduce inception_b2_7x1_reduce_relu inception_b2_7x1_relu inception_b2_concat inception_b2_pool_ave inception_b3_1x1 inception_b3_1x1_2 inception_b3_1x1_2_relu inception_b3_1x1_relu inception_b3_1x7 inception_b3_1x7_2 inception_b3_1x7_2_relu inception_b3_1x7_3 inception_b3_1x7_3_relu inception_b3_1x7_reduce inception_b3_1x7_reduce_relu inception_b3_1x7_relu inception_b3_7x1 inception_b3_7x1_2 inception_b3_7x1_2_relu inception_b3_7x1_3 inception_b3_7x1_3_relu inception_b3_7x1_reduce inception_b3_7x1_reduce_relu inception_b3_7x1_relu inception_b3_concat inception_b3_pool_ave inception_b4_1x1 inception_b4_1x1_2 inception_b4_1x1_2_relu inception_b4_1x1_relu inception_b4_1x7 inception_b4_1x7_2 inception_b4_1x7_2_relu inception_b4_1x7_3 inception_b4_1x7_3_relu inception_b4_1x7_reduce inception_b4_1x7_reduce_relu inception_b4_1x7_relu inception_b4_7x1 inception_b4_7x1_2 inception_b4_7x1_2_relu inception_b4_7x1_3 inception_b4_7x1_3_relu inception_b4_7x1_reduce inception_b4_7x1_reduce_relu inception_b4_7x1_relu inception_b4_concat inception_b4_pool_ave inception_c1_1x1 inception_c1_1x1_2 inception_c1_1x1_2_relu inception_c1_1x1_relu inception_c1_1x3 inception_c1_1x3_2 inception_c1_1x3_2_relu inception_c1_1x3_reduce inception_c1_1x3_reduce_relu inception_c1_1x3_relu inception_c1_3x1 
inception_c1_3x1_2 inception_c1_3x1_2_relu inception_c1_3x1_relu inception_c1_3x3 inception_c1_3x3_reduce inception_c1_3x3_reduce_relu inception_c1_3x3_relu inception_c1_concat inception_c1_pool inception_c2_1x1 inception_c2_1x1_2 inception_c2_1x1_2_relu inception_c2_1x1_relu inception_c2_1x3 inception_c2_1x3_2 inception_c2_1x3_2_relu inception_c2_1x3_reduce inception_c2_1x3_reduce_relu inception_c2_1x3_relu inception_c2_3x1 inception_c2_3x1_2 inception_c2_3x1_2_relu inception_c2_3x1_relu inception_c2_3x3 inception_c2_3x3_reduce inception_c2_3x3_reduce_relu inception_c2_3x3_relu inception_c2_concat inception_c2_pool input pool1_3x3_s2 pool2_3x3_s2 pool_8x8_s1 prob reduction_a_3x3 reduction_a_3x3_2 reduction_a_3x3_2_reduce reduction_a_3x3_2_reduce_relu reduction_a_3x3_2_relu reduction_a_3x3_3 reduction_a_3x3_3_relu reduction_a_3x3_relu reduction_a_concat reduction_a_pool reduction_b_1x7 reduction_b_1x7_reduce reduction_b_1x7_reduce_relu reduction_b_1x7_relu reduction_b_3x3 reduction_b_3x3_2 reduction_b_3x3_2_relu reduction_b_3x3_reduce reduction_b_3x3_reduce_relu reduction_b_3x3_relu reduction_b_7x1 reduction_b_7x1_relu reduction_b_concat reduction_b_pool ," +"prob ," +" 122 ," +"QuantizedConvolution conv1_3x3_s2 1 1 input conv1_3x3_relu 1 3 32 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv2_3x3_s1 1 1 conv1_3x3_relu conv2_3x3_relu 1 32 32 3 3 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv3_3x3_s1 1 1 conv2_3x3_relu conv3_3x3_relu 1 32 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedPooling pool1_3x3_s2 1 1 conv3_3x3_relu pool1_3x3_s2 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution conv4_3x3_reduce 1 1 pool1_3x3_s2 conv4_relu_3x3_reduce 1 64 80 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_3x3 1 1 conv4_relu_3x3_reduce conv4_relu_3x3 1 80 192 3 3 1 1 0 0 1 -1 1 1 1 ," +"QuantizedPooling pool2_3x3_s2 1 1 conv4_relu_3x3 pool2_3x3_s2 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution inception_a1_1x1 1 1 pool2_3x3_s2 inception_a1_1x1_relu 1 192 64 1 1 1 1 
0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a1_5x5_reduce 1 1 pool2_3x3_s2 inception_a1_5x5_reduce_relu 1 192 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a1_5x5 1 1 inception_a1_5x5_reduce_relu inception_a1_5x5_relu 1 48 64 5 5 1 1 2 2 1 -1 1 1 1 ," +"QuantizedConvolution inception_a1_3x3_reduce 1 1 pool2_3x3_s2 inception_a1_3x3_reduce_relu 1 192 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a1_3x3_1 1 1 inception_a1_3x3_reduce_relu inception_a1_3x3_1_relu 1 64 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_a1_3x3_2 1 1 inception_a1_3x3_1_relu inception_a1_3x3_2_relu 1 96 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedPooling inception_a1_pool 1 1 pool2_3x3_s2 inception_a1_pool 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_a1_pool_proj 1 1 inception_a1_pool inception_a1_pool_proj_relu 1 192 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_a1_output 4 1 inception_a1_1x1_relu inception_a1_5x5_relu inception_a1_3x3_2_relu inception_a1_pool_proj_relu inception_a1_output 1 ," +"QuantizedConvolution inception_a2_1x1 1 1 inception_a1_output inception_a2_1x1_relu 1 256 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a2_5x5_reduce 1 1 inception_a1_output inception_a2_5x5_reduce_relu 1 256 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a2_5x5 1 1 inception_a2_5x5_reduce_relu inception_a2_5x5_relu 1 48 64 5 5 1 1 2 2 1 -1 1 1 1 ," +"QuantizedConvolution inception_a2_3x3_reduce 1 1 inception_a1_output inception_a2_3x3_reduce_relu 1 256 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a2_3x3_1 1 1 inception_a2_3x3_reduce_relu inception_a2_3x3_1_relu 1 64 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_a2_3x3_2 1 1 inception_a2_3x3_1_relu inception_a2_3x3_2_relu 1 96 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedPooling inception_a2_pool 1 1 inception_a1_output inception_a2_pool 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_a2_pool_proj 1 1 
inception_a2_pool inception_a2_pool_proj_relu 1 256 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_a2_output 4 1 inception_a2_1x1_relu inception_a2_5x5_relu inception_a2_3x3_2_relu inception_a2_pool_proj_relu inception_a2_output 1 ," +"QuantizedConvolution inception_a3_1x1 1 1 inception_a2_output inception_a3_1x1_relu 1 288 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a3_5x5_reduce 1 1 inception_a2_output inception_a3_5x5_reduce_relu 1 288 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a3_5x5 1 1 inception_a3_5x5_reduce_relu inception_a3_5x5_relu 1 48 64 5 5 1 1 2 2 1 -1 1 1 1 ," +"QuantizedConvolution inception_a3_3x3_reduce 1 1 inception_a2_output inception_a3_3x3_reduce_relu 1 288 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_a3_3x3_1 1 1 inception_a3_3x3_reduce_relu inception_a3_3x3_1_relu 1 64 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_a3_3x3_2 1 1 inception_a3_3x3_1_relu inception_a3_3x3_2_relu 1 96 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedPooling inception_a3_pool 1 1 inception_a2_output inception_a3_pool 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_a3_pool_proj 1 1 inception_a3_pool inception_a3_pool_proj_relu 1 288 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_a3_output 4 1 inception_a3_1x1_relu inception_a3_5x5_relu inception_a3_3x3_2_relu inception_a3_pool_proj_relu inception_a3_output 1 ," +"QuantizedConvolution reduction_a_3x3 1 1 inception_a3_output reduction_a_3x3_relu 1 288 384 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_a_3x3_2_reduce 1 1 inception_a3_output reduction_a_3x3_2_reduce_relu 1 288 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_a_3x3_2 1 1 reduction_a_3x3_2_reduce_relu reduction_a_3x3_2_relu 1 64 96 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution reduction_a_3x3_3 1 1 reduction_a_3x3_2_relu reduction_a_3x3_3_relu 1 96 96 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedPooling reduction_a_pool 1 1 inception_a3_output 
reduction_a_pool 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConcat reduction_a_concat 3 1 reduction_a_3x3_relu reduction_a_3x3_3_relu reduction_a_pool reduction_a_concat 1 ," +"QuantizedConvolution inception_b1_1x1_2 1 1 reduction_a_concat inception_b1_1x1_2_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_1x7_reduce 1 1 reduction_a_concat inception_b1_1x7_reduce_relu 1 768 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_1x7 1 1 inception_b1_1x7_reduce_relu inception_b1_1x7_relu 1 128 128 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_7x1 1 1 inception_b1_1x7_relu inception_b1_7x1_relu 1 128 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_7x1_reduce 1 1 reduction_a_concat inception_b1_7x1_reduce_relu 1 768 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_7x1_2 1 1 inception_b1_7x1_reduce_relu inception_b1_7x1_2_relu 1 128 128 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_1x7_2 1 1 inception_b1_7x1_2_relu inception_b1_1x7_2_relu 1 128 128 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_7x1_3 1 1 inception_b1_1x7_2_relu inception_b1_7x1_3_relu 1 128 128 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b1_1x7_3 1 1 inception_b1_7x1_3_relu inception_b1_1x7_3_relu 1 128 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedPooling inception_b1_pool_ave 1 1 reduction_a_concat inception_b1_pool_ave 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_b1_1x1 1 1 inception_b1_pool_ave inception_b1_1x1_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_b1_concat 4 1 inception_b1_1x1_2_relu inception_b1_7x1_relu inception_b1_1x7_3_relu inception_b1_1x1_relu inception_b1_concat 1 ," +"QuantizedConvolution inception_b2_1x1_2 1 1 inception_b1_concat inception_b2_1x1_2_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_1x7_reduce 1 1 inception_b1_concat inception_b2_1x7_reduce_relu 1 768 160 1 1 1 1 0 0 1 -1 1 1 
1 ," +"QuantizedConvolution inception_b2_1x7 1 1 inception_b2_1x7_reduce_relu inception_b2_1x7_relu 1 160 160 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_7x1 1 1 inception_b2_1x7_relu inception_b2_7x1_relu 1 160 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_7x1_reduce 1 1 inception_b1_concat inception_b2_7x1_reduce_relu 1 768 160 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_7x1_2 1 1 inception_b2_7x1_reduce_relu inception_b2_7x1_2_relu 1 160 160 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_1x7_2 1 1 inception_b2_7x1_2_relu inception_b2_1x7_2_relu 1 160 160 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_7x1_3 1 1 inception_b2_1x7_2_relu inception_b2_7x1_3_relu 1 160 160 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b2_1x7_3 1 1 inception_b2_7x1_3_relu inception_b2_1x7_3_relu 1 160 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedPooling inception_b2_pool_ave 1 1 inception_b1_concat inception_b2_pool_ave 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_b2_1x1 1 1 inception_b2_pool_ave inception_b2_1x1_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_b2_concat 4 1 inception_b2_1x1_2_relu inception_b2_7x1_relu inception_b2_1x7_3_relu inception_b2_1x1_relu inception_b2_concat 1 ," +"QuantizedConvolution inception_b3_1x1_2 1 1 inception_b2_concat inception_b3_1x1_2_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_1x7_reduce 1 1 inception_b2_concat inception_b3_1x7_reduce_relu 1 768 160 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_1x7 1 1 inception_b3_1x7_reduce_relu inception_b3_1x7_relu 1 160 160 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_7x1 1 1 inception_b3_1x7_relu inception_b3_7x1_relu 1 160 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_7x1_reduce 1 1 inception_b2_concat inception_b3_7x1_reduce_relu 1 768 160 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution 
inception_b3_7x1_2 1 1 inception_b3_7x1_reduce_relu inception_b3_7x1_2_relu 1 160 160 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_1x7_2 1 1 inception_b3_7x1_2_relu inception_b3_1x7_2_relu 1 160 160 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_7x1_3 1 1 inception_b3_1x7_2_relu inception_b3_7x1_3_relu 1 160 160 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b3_1x7_3 1 1 inception_b3_7x1_3_relu inception_b3_1x7_3_relu 1 160 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedPooling inception_b3_pool_ave 1 1 inception_b2_concat inception_b3_pool_ave 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_b3_1x1 1 1 inception_b3_pool_ave inception_b3_1x1_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_b3_concat 4 1 inception_b3_1x1_2_relu inception_b3_7x1_relu inception_b3_1x7_3_relu inception_b3_1x1_relu inception_b3_concat 1 ," +"QuantizedConvolution inception_b4_1x1_2 1 1 inception_b3_concat inception_b4_1x1_2_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_1x7_reduce 1 1 inception_b3_concat inception_b4_1x7_reduce_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_1x7 1 1 inception_b4_1x7_reduce_relu inception_b4_1x7_relu 1 192 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_7x1 1 1 inception_b4_1x7_relu inception_b4_7x1_relu 1 192 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_7x1_reduce 1 1 inception_b3_concat inception_b4_7x1_reduce_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_7x1_2 1 1 inception_b4_7x1_reduce_relu inception_b4_7x1_2_relu 1 192 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_1x7_2 1 1 inception_b4_7x1_2_relu inception_b4_1x7_2_relu 1 192 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_7x1_3 1 1 inception_b4_1x7_2_relu inception_b4_7x1_3_relu 1 192 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_b4_1x7_3 1 1 
inception_b4_7x1_3_relu inception_b4_1x7_3_relu 1 192 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedPooling inception_b4_pool_ave 1 1 inception_b3_concat inception_b4_pool_ave 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_b4_1x1 1 1 inception_b4_pool_ave inception_b4_1x1_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_b4_concat 4 1 inception_b4_1x1_2_relu inception_b4_7x1_relu inception_b4_1x7_3_relu inception_b4_1x1_relu inception_b4_concat 1 ," +"QuantizedConvolution reduction_b_3x3_reduce 1 1 inception_b4_concat reduction_b_3x3_reduce_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_b_3x3 1 1 reduction_b_3x3_reduce_relu reduction_b_3x3_relu 1 192 320 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_b_1x7_reduce 1 1 inception_b4_concat reduction_b_1x7_reduce_relu 1 768 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_b_1x7 1 1 reduction_b_1x7_reduce_relu reduction_b_1x7_relu 1 192 192 1 7 1 1 0 3 1 -1 1 1 1 ," +"QuantizedConvolution reduction_b_7x1 1 1 reduction_b_1x7_relu reduction_b_7x1_relu 1 192 192 7 1 1 1 3 0 1 -1 1 1 1 ," +"QuantizedConvolution reduction_b_3x3_2 1 1 reduction_b_7x1_relu reduction_b_3x3_2_relu 1 192 192 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedPooling reduction_b_pool 1 1 inception_b4_concat reduction_b_pool 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConcat reduction_b_concat 3 1 reduction_b_3x3_relu reduction_b_3x3_2_relu reduction_b_pool reduction_b_concat 1 ," +"QuantizedConvolution inception_c1_1x1_2 1 1 reduction_b_concat inception_c1_1x1_2_relu 1 1280 320 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_1x3_reduce 1 1 reduction_b_concat inception_c1_1x3_reduce_relu 1 1280 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_1x3 1 1 inception_c1_1x3_reduce_relu inception_c1_1x3_relu 1 384 384 1 3 1 1 0 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_3x1 1 1 inception_c1_1x3_reduce_relu inception_c1_3x1_relu 1 384 384 3 1 1 1 1 0 1 -1 1 
1 1 ," +"QuantizedConvolution inception_c1_3x3_reduce 1 1 reduction_b_concat inception_c1_3x3_reduce_relu 1 1280 448 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_3x3 1 1 inception_c1_3x3_reduce_relu inception_c1_3x3_relu 1 448 384 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_1x3_2 1 1 inception_c1_3x3_relu inception_c1_1x3_2_relu 1 384 384 1 3 1 1 0 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c1_3x1_2 1 1 inception_c1_3x3_relu inception_c1_3x1_2_relu 1 384 384 3 1 1 1 1 0 1 -1 1 1 1 ," +"QuantizedPooling inception_c1_pool 1 1 reduction_b_concat inception_c1_pool 1 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_c1_1x1 1 1 inception_c1_pool inception_c1_1x1_relu 1 1280 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_c1_concat 6 1 inception_c1_1x1_2_relu inception_c1_1x3_relu inception_c1_3x1_relu inception_c1_1x3_2_relu inception_c1_3x1_2_relu inception_c1_1x1_relu inception_c1_concat 1 ," +"QuantizedConvolution inception_c2_1x1_2 1 1 inception_c1_concat inception_c2_1x1_2_relu 1 2048 320 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_1x3_reduce 1 1 inception_c1_concat inception_c2_1x3_reduce_relu 1 2048 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_1x3 1 1 inception_c2_1x3_reduce_relu inception_c2_1x3_relu 1 384 384 1 3 1 1 0 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_3x1 1 1 inception_c2_1x3_reduce_relu inception_c2_3x1_relu 1 384 384 3 1 1 1 1 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_3x3_reduce 1 1 inception_c1_concat inception_c2_3x3_reduce_relu 1 2048 448 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_3x3 1 1 inception_c2_3x3_reduce_relu inception_c2_3x3_relu 1 448 384 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_1x3_2 1 1 inception_c2_3x3_relu inception_c2_1x3_2_relu 1 384 384 1 3 1 1 0 1 1 -1 1 1 1 ," +"QuantizedConvolution inception_c2_3x1_2 1 1 inception_c2_3x3_relu inception_c2_3x1_2_relu 1 384 384 3 1 1 1 1 0 1 
-1 1 1 1 ," +"QuantizedPooling inception_c2_pool 1 1 inception_c1_concat inception_c2_pool 0 3 3 1 1 1 1 -1 -1 -1 1 ," +"QuantizedConvolution inception_c2_1x1 1 1 inception_c2_pool inception_c2_1x1_relu 1 2048 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConcat inception_c2_concat 6 1 inception_c2_1x1_2_relu inception_c2_1x3_relu inception_c2_3x1_relu inception_c2_1x3_2_relu inception_c2_3x1_2_relu inception_c2_1x1_relu inception_c2_concat 1 ," +"QuantizedPooling pool_8x8_s1 1 1 inception_c2_concat pool_8x8_s1 1 11 11 1 1 0 0 -1 -1 -1 0 ," +"Reshape classifier_Reshape 1 1 pool_8x8_s1 classifier_Reshape 0 4 4 0 2048 1 1 0 ," +"InnerProduct classifier 1 1 classifier_Reshape classifier 1000 1 0 1 ," +"SoftmaxCaffe prob 1 1 classifier prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v1.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v1.tnnproto new file mode 100644 index 0000000..a3c54b9 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v1.tnnproto @@ -0,0 +1,35 @@ +"1 58 1 4206624770 ," +"input 1 3 224 224 ," +" conv1 conv2_1/dw conv2_1/sep conv2_2/dw conv2_2/sep conv3_1/dw conv3_1/sep conv3_2/dw conv3_2/sep conv4_1/dw conv4_1/sep conv4_2/dw conv4_2/sep conv5_1/dw conv5_1/sep conv5_2/dw conv5_2/sep conv5_3/dw conv5_3/sep conv5_4/dw conv5_4/sep conv5_5/dw conv5_5/sep conv5_6/dw conv5_6/sep conv6/dw conv6/sep fc7 input pool6 prob relu1 relu2_1/dw relu2_1/sep relu2_2/dw relu2_2/sep relu3_1/dw relu3_1/sep relu3_2/dw relu3_2/sep relu4_1/dw relu4_1/sep relu4_2/dw relu4_2/sep relu5_1/dw relu5_1/sep relu5_2/dw relu5_2/sep relu5_3/dw relu5_3/sep relu5_4/dw relu5_4/sep relu5_5/dw relu5_5/sep relu5_6/dw relu5_6/sep relu6/dw relu6/sep ," +"prob ," +" 30 ," +"QuantizedConvolution conv1 1 1 input relu1 1 3 32 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_1/dw 1 1 relu1 relu2_1/dw 32 1 32 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_1/sep 1 1 relu2_1/dw relu2_1/sep 1 32 64 1 1 1 1 0 0 1 -1 1 1 1 
," +"QuantizedConvolution conv2_2/dw 1 1 relu2_1/sep relu2_2/dw 64 1 64 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_2/sep 1 1 relu2_2/dw relu2_2/sep 1 64 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv3_1/dw 1 1 relu2_2/sep relu3_1/dw 128 1 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv3_1/sep 1 1 relu3_1/dw relu3_1/sep 1 128 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv3_2/dw 1 1 relu3_1/sep relu3_2/dw 128 1 128 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv3_2/sep 1 1 relu3_2/dw relu3_2/sep 1 128 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_1/dw 1 1 relu3_2/sep relu4_1/dw 256 1 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_1/sep 1 1 relu4_1/dw relu4_1/sep 1 256 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_2/dw 1 1 relu4_1/sep relu4_2/dw 256 1 256 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_2/sep 1 1 relu4_2/dw relu4_2/sep 1 256 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_1/dw 1 1 relu4_2/sep relu5_1/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_1/sep 1 1 relu5_1/dw relu5_1/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_2/dw 1 1 relu5_1/sep relu5_2/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_2/sep 1 1 relu5_2/dw relu5_2/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_3/dw 1 1 relu5_2/sep relu5_3/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_3/sep 1 1 relu5_3/dw relu5_3/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_4/dw 1 1 relu5_3/sep relu5_4/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_4/sep 1 1 relu5_4/dw relu5_4/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_5/dw 1 1 relu5_4/sep relu5_5/dw 512 1 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_5/sep 1 1 relu5_5/dw relu5_5/sep 1 512 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_6/dw 1 1 relu5_5/sep 
relu5_6/dw 512 1 512 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_6/sep 1 1 relu5_6/dw relu5_6/sep 1 512 1024 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv6/dw 1 1 relu5_6/sep relu6/dw 1024 1 1024 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv6/sep 1 1 relu6/dw relu6/sep 1 1024 1024 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedPooling pool6 1 1 relu6/sep pool6 1 7 7 1 1 0 0 -1 -1 -1 0 ," +"QuantizedConvolution fc7 1 1 pool6 fc7 1 1024 1000 1 1 1 1 0 0 1 -1 1 1 0 ," +"SoftmaxCaffe prob 1 1 fc7 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v2.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v2.tnnproto new file mode 100644 index 0000000..adf32fb --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_mobilenet_v2.tnnproto @@ -0,0 +1,71 @@ +"1 103 1 4206624770 ," +"input 1 3 224 224 ," +" block_3_1 block_4_1 block_4_2 block_4_4 block_4_5 block_4_6 block_5_1 block_5_2 block_6_1 block_6_2 conv1/bn conv2_1/dwise/bn conv2_1/expand/bn conv2_1/linear/bn conv2_2/dwise/bn conv2_2/expand/bn conv2_2/linear/bn conv3_1/dwise/bn conv3_1/expand/bn conv3_1/linear/bn conv3_2/dwise/bn conv3_2/expand/bn conv3_2/linear/bn conv4_1/dwise/bn conv4_1/expand/bn conv4_1/linear/bn conv4_2/dwise/bn conv4_2/expand/bn conv4_2/linear/bn conv4_3/dwise/bn conv4_3/expand/bn conv4_3/linear/bn conv4_4/dwise/bn conv4_4/expand/bn conv4_4/linear/bn conv4_5/dwise/bn conv4_5/expand/bn conv4_5/linear/bn conv4_6/dwise/bn conv4_6/expand/bn conv4_6/linear/bn conv4_7/dwise/bn conv4_7/expand/bn conv4_7/linear/bn conv5_1/dwise/bn conv5_1/expand/bn conv5_1/linear/bn conv5_2/dwise/bn conv5_2/expand/bn conv5_2/linear/bn conv5_3/dwise/bn conv5_3/expand/bn conv5_3/linear/bn conv6_1/dwise/bn conv6_1/expand/bn conv6_1/linear/bn conv6_2/dwise/bn conv6_2/expand/bn conv6_2/linear/bn conv6_3/dwise/bn conv6_3/expand/bn conv6_3/linear/bn conv6_4/bn fc7 input pool6 prob relu1 relu2_1/dwise relu2_1/expand relu2_2/dwise relu2_2/expand relu3_1/dwise 
relu3_1/expand relu3_2/dwise relu3_2/expand relu4_1/dwise relu4_1/expand relu4_2/dwise relu4_2/expand relu4_3/dwise relu4_3/expand relu4_4/dwise relu4_4/expand relu4_5/dwise relu4_5/expand relu4_6/dwise relu4_6/expand relu4_7/dwise relu4_7/expand relu5_1/dwise relu5_1/expand relu5_2/dwise relu5_2/expand relu5_3/dwise relu5_3/expand relu6_1/dwise relu6_1/expand relu6_2/dwise relu6_2/expand relu6_3/dwise relu6_3/expand relu6_4 ," +"prob ," +" 66 ," +"QuantizedConvolution conv1 1 1 input relu1 1 3 32 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_1/expand 1 1 relu1 relu2_1/expand 1 32 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv2_1/dwise 1 1 relu2_1/expand relu2_1/dwise 32 1 32 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_1/linear 1 1 relu2_1/dwise conv2_1/linear/bn 1 32 16 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv2_2/expand 1 1 conv2_1/linear/bn relu2_2/expand 1 16 96 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv2_2/dwise 1 1 relu2_2/expand relu2_2/dwise 96 1 96 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv2_2/linear 1 1 relu2_2/dwise conv2_2/linear/bn 1 96 24 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv3_1/expand 1 1 conv2_2/linear/bn relu3_1/expand 1 24 144 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv3_1/dwise 1 1 relu3_1/expand relu3_1/dwise 144 1 144 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv3_1/linear 1 1 relu3_1/dwise conv3_1/linear/bn 1 144 24 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_3_1 2 1 conv2_2/linear/bn conv3_1/linear/bn block_3_1 1 ," +"QuantizedConvolution conv3_2/expand 1 1 block_3_1 relu3_2/expand 1 24 144 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv3_2/dwise 1 1 relu3_2/expand relu3_2/dwise 144 1 144 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv3_2/linear 1 1 relu3_2/dwise conv3_2/linear/bn 1 144 32 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv4_1/expand 1 1 conv3_2/linear/bn relu4_1/expand 1 32 192 1 1 1 1 0 0 1 -1 1 1 1 ," 
+"QuantizedConvolution conv4_1/dwise 1 1 relu4_1/expand relu4_1/dwise 192 1 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_1/linear 1 1 relu4_1/dwise conv4_1/linear/bn 1 192 32 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_4_1 2 1 conv3_2/linear/bn conv4_1/linear/bn block_4_1 1 ," +"QuantizedConvolution conv4_2/expand 1 1 block_4_1 relu4_2/expand 1 32 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_2/dwise 1 1 relu4_2/expand relu4_2/dwise 192 1 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_2/linear 1 1 relu4_2/dwise conv4_2/linear/bn 1 192 32 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_4_2 2 1 block_4_1 conv4_2/linear/bn block_4_2 1 ," +"QuantizedConvolution conv4_3/expand 1 1 block_4_2 relu4_3/expand 1 32 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_3/dwise 1 1 relu4_3/expand relu4_3/dwise 192 1 192 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_3/linear 1 1 relu4_3/dwise conv4_3/linear/bn 1 192 64 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv4_4/expand 1 1 conv4_3/linear/bn relu4_4/expand 1 64 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_4/dwise 1 1 relu4_4/expand relu4_4/dwise 384 1 384 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_4/linear 1 1 relu4_4/dwise conv4_4/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_4_4 2 1 conv4_3/linear/bn conv4_4/linear/bn block_4_4 1 ," +"QuantizedConvolution conv4_5/expand 1 1 block_4_4 relu4_5/expand 1 64 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_5/dwise 1 1 relu4_5/expand relu4_5/dwise 384 1 384 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_5/linear 1 1 relu4_5/dwise conv4_5/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_4_5 2 1 block_4_4 conv4_5/linear/bn block_4_5 1 ," +"QuantizedConvolution conv4_6/expand 1 1 block_4_5 relu4_6/expand 1 64 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_6/dwise 1 1 relu4_6/expand relu4_6/dwise 384 1 384 3 3 1 1 1 1 1 -1 1 1 1 
," +"QuantizedConvolution conv4_6/linear 1 1 relu4_6/dwise conv4_6/linear/bn 1 384 64 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_4_6 2 1 block_4_5 conv4_6/linear/bn block_4_6 1 ," +"QuantizedConvolution conv4_7/expand 1 1 block_4_6 relu4_7/expand 1 64 384 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv4_7/dwise 1 1 relu4_7/expand relu4_7/dwise 384 1 384 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv4_7/linear 1 1 relu4_7/dwise conv4_7/linear/bn 1 384 96 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv5_1/expand 1 1 conv4_7/linear/bn relu5_1/expand 1 96 576 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_1/dwise 1 1 relu5_1/expand relu5_1/dwise 576 1 576 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_1/linear 1 1 relu5_1/dwise conv5_1/linear/bn 1 576 96 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_5_1 2 1 conv4_7/linear/bn conv5_1/linear/bn block_5_1 1 ," +"QuantizedConvolution conv5_2/expand 1 1 block_5_1 relu5_2/expand 1 96 576 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_2/dwise 1 1 relu5_2/expand relu5_2/dwise 576 1 576 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_2/linear 1 1 relu5_2/dwise conv5_2/linear/bn 1 576 96 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_5_2 2 1 block_5_1 conv5_2/linear/bn block_5_2 1 ," +"QuantizedConvolution conv5_3/expand 1 1 block_5_2 relu5_3/expand 1 96 576 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv5_3/dwise 1 1 relu5_3/expand relu5_3/dwise 576 1 576 3 3 2 2 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv5_3/linear 1 1 relu5_3/dwise conv5_3/linear/bn 1 576 160 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv6_1/expand 1 1 conv5_3/linear/bn relu6_1/expand 1 160 960 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv6_1/dwise 1 1 relu6_1/expand relu6_1/dwise 960 1 960 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv6_1/linear 1 1 relu6_1/dwise conv6_1/linear/bn 1 960 160 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_6_1 2 1 conv5_3/linear/bn 
conv6_1/linear/bn block_6_1 1 ," +"QuantizedConvolution conv6_2/expand 1 1 block_6_1 relu6_2/expand 1 160 960 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv6_2/dwise 1 1 relu6_2/expand relu6_2/dwise 960 1 960 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv6_2/linear 1 1 relu6_2/dwise conv6_2/linear/bn 1 960 160 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd block_6_2 2 1 block_6_1 conv6_2/linear/bn block_6_2 1 ," +"QuantizedConvolution conv6_3/expand 1 1 block_6_2 relu6_3/expand 1 160 960 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution conv6_3/dwise 1 1 relu6_3/expand relu6_3/dwise 960 1 960 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution conv6_3/linear 1 1 relu6_3/dwise conv6_3/linear/bn 1 960 320 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution conv6_4 1 1 conv6_3/linear/bn relu6_4 1 320 1280 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedPooling pool6 1 1 relu6_4 pool6 1 0 0 1 1 0 0 -1 -1 -1 0 ," +"QuantizedConvolution fc7 1 1 pool6 fc7 1 1280 1000 1 1 1 1 0 0 1 -1 1 1 0 ," +"SoftmaxCaffe prob 1 1 fc7 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_resnet50.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/quant_resnet50.tnnproto new file mode 100644 index 0000000..5b7e566 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_resnet50.tnnproto @@ -0,0 +1,95 @@ +"1 124 1 4206624770 ," +"input 1 3 224 224 ," +" conv1_Y conv1_relu_Y fc1000_Gemm_Y fc1000_Reshape_Y input pool1_Y pool5_Y prob_Y res2a_Y res2a_branch1_Y res2a_branch2a_Y res2a_branch2a_relu_Y res2a_branch2b_Y res2a_branch2b_relu_Y res2a_branch2c_Y res2a_relu_Y res2b_Y res2b_branch2a_Y res2b_branch2a_relu_Y res2b_branch2b_Y res2b_branch2b_relu_Y res2b_branch2c_Y res2b_relu_Y res2c_Y res2c_branch2a_Y res2c_branch2a_relu_Y res2c_branch2b_Y res2c_branch2b_relu_Y res2c_branch2c_Y res2c_relu_Y res3a_Y res3a_branch1_Y res3a_branch2a_Y res3a_branch2a_relu_Y res3a_branch2b_Y res3a_branch2b_relu_Y res3a_branch2c_Y res3a_relu_Y res3b_Y res3b_branch2a_Y res3b_branch2a_relu_Y 
res3b_branch2b_Y res3b_branch2b_relu_Y res3b_branch2c_Y res3b_relu_Y res3c_Y res3c_branch2a_Y res3c_branch2a_relu_Y res3c_branch2b_Y res3c_branch2b_relu_Y res3c_branch2c_Y res3c_relu_Y res3d_Y res3d_branch2a_Y res3d_branch2a_relu_Y res3d_branch2b_Y res3d_branch2b_relu_Y res3d_branch2c_Y res3d_relu_Y res4a_Y res4a_branch1_Y res4a_branch2a_Y res4a_branch2a_relu_Y res4a_branch2b_Y res4a_branch2b_relu_Y res4a_branch2c_Y res4a_relu_Y res4b_Y res4b_branch2a_Y res4b_branch2a_relu_Y res4b_branch2b_Y res4b_branch2b_relu_Y res4b_branch2c_Y res4b_relu_Y res4c_Y res4c_branch2a_Y res4c_branch2a_relu_Y res4c_branch2b_Y res4c_branch2b_relu_Y res4c_branch2c_Y res4c_relu_Y res4d_Y res4d_branch2a_Y res4d_branch2a_relu_Y res4d_branch2b_Y res4d_branch2b_relu_Y res4d_branch2c_Y res4d_relu_Y res4e_Y res4e_branch2a_Y res4e_branch2a_relu_Y res4e_branch2b_Y res4e_branch2b_relu_Y res4e_branch2c_Y res4e_relu_Y res4f_Y res4f_branch2a_Y res4f_branch2a_relu_Y res4f_branch2b_Y res4f_branch2b_relu_Y res4f_branch2c_Y res4f_relu_Y res5a_Y res5a_branch1_Y res5a_branch2a_Y res5a_branch2a_relu_Y res5a_branch2b_Y res5a_branch2b_relu_Y res5a_branch2c_Y res5a_relu_Y res5b_Y res5b_branch2a_Y res5b_branch2a_relu_Y res5b_branch2b_Y res5b_branch2b_relu_Y res5b_branch2c_Y res5b_relu_Y res5c_Y res5c_branch2a_Y res5c_branch2a_relu_Y res5c_branch2b_Y res5c_branch2b_relu_Y res5c_branch2c_Y res5c_relu_Y ," +"prob_Y ," +" 90 ," +"QuantizedConvolution conv1 1 1 input conv1_relu_Y 1 3 64 7 7 2 2 3 3 1 -1 1 1 1 ," +"QuantizedPooling pool1 1 1 conv1_relu_Y pool1_Y 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution res2a_branch1 1 1 pool1_Y res2a_branch1_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedConvolution res2a_branch2a 1 1 pool1_Y res2a_branch2a_relu_Y 1 64 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res2a_branch2b 1 1 res2a_branch2a_relu_Y res2a_branch2b_relu_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res2a_branch2c 1 1 res2a_branch2b_relu_Y res2a_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 
1 0 ," +"QuantizedAdd res2a 2 1 res2a_branch1_Y res2a_branch2c_Y res2a_Y ," +"QuantizedReLU res2a_relu 1 1 res2a_Y res2a_relu_Y ," +"QuantizedConvolution res2b_branch2a 1 1 res2a_relu_Y res2b_branch2a_relu_Y 1 256 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res2b_branch2b 1 1 res2b_branch2a_relu_Y res2b_branch2b_relu_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res2b_branch2c 1 1 res2b_branch2b_relu_Y res2b_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res2b 2 1 res2a_relu_Y res2b_branch2c_Y res2b_Y ," +"QuantizedReLU res2b_relu 1 1 res2b_Y res2b_relu_Y ," +"QuantizedConvolution res2c_branch2a 1 1 res2b_relu_Y res2c_branch2a_relu_Y 1 256 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res2c_branch2b 1 1 res2c_branch2a_relu_Y res2c_branch2b_relu_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res2c_branch2c 1 1 res2c_branch2b_relu_Y res2c_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res2c 2 1 res2b_relu_Y res2c_branch2c_Y res2c_Y ," +"QuantizedReLU res2c_relu 1 1 res2c_Y res2c_relu_Y ," +"QuantizedConvolution res3a_branch1 1 1 res2c_relu_Y res3a_branch1_Y 1 256 512 1 1 2 2 0 0 1 -1 1 1 0 ," +"QuantizedConvolution res3a_branch2a 1 1 res2c_relu_Y res3a_branch2a_relu_Y 1 256 128 1 1 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res3a_branch2b 1 1 res3a_branch2a_relu_Y res3a_branch2b_relu_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res3a_branch2c 1 1 res3a_branch2b_relu_Y res3a_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res3a 2 1 res3a_branch1_Y res3a_branch2c_Y res3a_Y ," +"QuantizedReLU res3a_relu 1 1 res3a_Y res3a_relu_Y ," +"QuantizedConvolution res3b_branch2a 1 1 res3a_relu_Y res3b_branch2a_relu_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res3b_branch2b 1 1 res3b_branch2a_relu_Y res3b_branch2b_relu_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res3b_branch2c 1 1 res3b_branch2b_relu_Y res3b_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 0 
," +"QuantizedAdd res3b 2 1 res3a_relu_Y res3b_branch2c_Y res3b_Y ," +"QuantizedReLU res3b_relu 1 1 res3b_Y res3b_relu_Y ," +"QuantizedConvolution res3c_branch2a 1 1 res3b_relu_Y res3c_branch2a_relu_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res3c_branch2b 1 1 res3c_branch2a_relu_Y res3c_branch2b_relu_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res3c_branch2c 1 1 res3c_branch2b_relu_Y res3c_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res3c 2 1 res3b_relu_Y res3c_branch2c_Y res3c_Y ," +"QuantizedReLU res3c_relu 1 1 res3c_Y res3c_relu_Y ," +"QuantizedConvolution res3d_branch2a 1 1 res3c_relu_Y res3d_branch2a_relu_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res3d_branch2b 1 1 res3d_branch2a_relu_Y res3d_branch2b_relu_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res3d_branch2c 1 1 res3d_branch2b_relu_Y res3d_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res3d 2 1 res3c_relu_Y res3d_branch2c_Y res3d_Y ," +"QuantizedReLU res3d_relu 1 1 res3d_Y res3d_relu_Y ," +"QuantizedConvolution res4a_branch1 1 1 res3d_relu_Y res4a_branch1_Y 1 512 1024 1 1 2 2 0 0 1 -1 1 1 0 ," +"QuantizedConvolution res4a_branch2a 1 1 res3d_relu_Y res4a_branch2a_relu_Y 1 512 256 1 1 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4a_branch2b 1 1 res4a_branch2a_relu_Y res4a_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4a_branch2c 1 1 res4a_branch2b_relu_Y res4a_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res4a 2 1 res4a_branch1_Y res4a_branch2c_Y res4a_Y ," +"QuantizedReLU res4a_relu 1 1 res4a_Y res4a_relu_Y ," +"QuantizedConvolution res4b_branch2a 1 1 res4a_relu_Y res4b_branch2a_relu_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4b_branch2b 1 1 res4b_branch2a_relu_Y res4b_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4b_branch2c 1 1 res4b_branch2b_relu_Y res4b_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 
1 1 0 ," +"QuantizedAdd res4b 2 1 res4a_relu_Y res4b_branch2c_Y res4b_Y ," +"QuantizedReLU res4b_relu 1 1 res4b_Y res4b_relu_Y ," +"QuantizedConvolution res4c_branch2a 1 1 res4b_relu_Y res4c_branch2a_relu_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4c_branch2b 1 1 res4c_branch2a_relu_Y res4c_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4c_branch2c 1 1 res4c_branch2b_relu_Y res4c_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res4c 2 1 res4b_relu_Y res4c_branch2c_Y res4c_Y ," +"QuantizedReLU res4c_relu 1 1 res4c_Y res4c_relu_Y ," +"QuantizedConvolution res4d_branch2a 1 1 res4c_relu_Y res4d_branch2a_relu_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4d_branch2b 1 1 res4d_branch2a_relu_Y res4d_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4d_branch2c 1 1 res4d_branch2b_relu_Y res4d_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res4d 2 1 res4c_relu_Y res4d_branch2c_Y res4d_Y ," +"QuantizedReLU res4d_relu 1 1 res4d_Y res4d_relu_Y ," +"QuantizedConvolution res4e_branch2a 1 1 res4d_relu_Y res4e_branch2a_relu_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4e_branch2b 1 1 res4e_branch2a_relu_Y res4e_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4e_branch2c 1 1 res4e_branch2b_relu_Y res4e_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res4e 2 1 res4d_relu_Y res4e_branch2c_Y res4e_Y ," +"QuantizedReLU res4e_relu 1 1 res4e_Y res4e_relu_Y ," +"QuantizedConvolution res4f_branch2a 1 1 res4e_relu_Y res4f_branch2a_relu_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res4f_branch2b 1 1 res4f_branch2a_relu_Y res4f_branch2b_relu_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res4f_branch2c 1 1 res4f_branch2b_relu_Y res4f_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res4f 2 1 res4e_relu_Y res4f_branch2c_Y res4f_Y ," +"QuantizedReLU 
res4f_relu 1 1 res4f_Y res4f_relu_Y ," +"QuantizedConvolution res5a_branch1 1 1 res4f_relu_Y res5a_branch1_Y 1 1024 2048 1 1 2 2 0 0 1 -1 1 1 0 ," +"QuantizedConvolution res5a_branch2a 1 1 res4f_relu_Y res5a_branch2a_relu_Y 1 1024 512 1 1 2 2 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res5a_branch2b 1 1 res5a_branch2a_relu_Y res5a_branch2b_relu_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res5a_branch2c 1 1 res5a_branch2b_relu_Y res5a_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res5a 2 1 res5a_branch1_Y res5a_branch2c_Y res5a_Y ," +"QuantizedReLU res5a_relu 1 1 res5a_Y res5a_relu_Y ," +"QuantizedConvolution res5b_branch2a 1 1 res5a_relu_Y res5b_branch2a_relu_Y 1 2048 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res5b_branch2b 1 1 res5b_branch2a_relu_Y res5b_branch2b_relu_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res5b_branch2c 1 1 res5b_branch2b_relu_Y res5b_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res5b 2 1 res5a_relu_Y res5b_branch2c_Y res5b_Y ," +"QuantizedReLU res5b_relu 1 1 res5b_Y res5b_relu_Y ," +"QuantizedConvolution res5c_branch2a 1 1 res5b_relu_Y res5c_branch2a_relu_Y 1 2048 512 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution res5c_branch2b 1 1 res5c_branch2a_relu_Y res5c_branch2b_relu_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConvolution res5c_branch2c 1 1 res5c_branch2b_relu_Y res5c_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 0 ," +"QuantizedAdd res5c 2 1 res5b_relu_Y res5c_branch2c_Y res5c_Y ," +"QuantizedReLU res5c_relu 1 1 res5c_Y res5c_relu_Y ," +"QuantizedPooling pool5 1 1 res5c_relu_Y pool5_Y 1 7 7 1 1 0 0 -1 -1 -1 0 ," +"Reshape fc1000_Reshape 1 1 pool5_Y fc1000_Reshape_Y 0 4 4 0 2048 1 1 0 ," +"InnerProduct fc1000_Gemm 1 1 fc1000_Reshape_Y fc1000_Gemm_Y 1000 1 0 1 ," +"SoftmaxCaffe prob 1 1 fc1000_Gemm_Y prob_Y 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.0.tnnproto 
b/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.0.tnnproto new file mode 100644 index 0000000..7fba617 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.0.tnnproto @@ -0,0 +1,44 @@ +"1 66 1 4206624770 ," +"input 1 3 227 227 ," +" conv1 conv10 fire2/concat fire2/expand1x1 fire2/expand3x3 fire2/relu_expand1x1 fire2/relu_expand3x3 fire2/relu_squeeze1x1 fire2/squeeze1x1 fire3/concat fire3/expand1x1 fire3/expand3x3 fire3/relu_expand1x1 fire3/relu_expand3x3 fire3/relu_squeeze1x1 fire3/squeeze1x1 fire4/concat fire4/expand1x1 fire4/expand3x3 fire4/relu_expand1x1 fire4/relu_expand3x3 fire4/relu_squeeze1x1 fire4/squeeze1x1 fire5/concat fire5/expand1x1 fire5/expand3x3 fire5/relu_expand1x1 fire5/relu_expand3x3 fire5/relu_squeeze1x1 fire5/squeeze1x1 fire6/concat fire6/expand1x1 fire6/expand3x3 fire6/relu_expand1x1 fire6/relu_expand3x3 fire6/relu_squeeze1x1 fire6/squeeze1x1 fire7/concat fire7/expand1x1 fire7/expand3x3 fire7/relu_expand1x1 fire7/relu_expand3x3 fire7/relu_squeeze1x1 fire7/squeeze1x1 fire8/concat fire8/expand1x1 fire8/expand3x3 fire8/relu_expand1x1 fire8/relu_expand3x3 fire8/relu_squeeze1x1 fire8/squeeze1x1 fire9/concat fire9/expand1x1 fire9/expand3x3 fire9/relu_expand1x1 fire9/relu_expand3x3 fire9/relu_squeeze1x1 fire9/squeeze1x1 input pool1 pool10 pool4 pool8 prob relu_conv1 relu_conv10 ," +"prob ," +" 39 ," +"QuantizedConvolution conv1 1 1 input relu_conv1 1 3 96 7 7 2 2 0 0 1 -1 1 1 1 ," +"QuantizedPooling pool1 1 1 relu_conv1 pool1 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution fire2/squeeze1x1 1 1 pool1 fire2/relu_squeeze1x1 1 96 16 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire2/expand1x1 1 1 fire2/relu_squeeze1x1 fire2/relu_expand1x1 1 16 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire2/expand3x3 1 1 fire2/relu_squeeze1x1 fire2/relu_expand3x3 1 16 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire2/concat 2 1 fire2/relu_expand1x1 fire2/relu_expand3x3 fire2/concat 1 ," +"QuantizedConvolution 
fire3/squeeze1x1 1 1 fire2/concat fire3/relu_squeeze1x1 1 128 16 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire3/expand1x1 1 1 fire3/relu_squeeze1x1 fire3/relu_expand1x1 1 16 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire3/expand3x3 1 1 fire3/relu_squeeze1x1 fire3/relu_expand3x3 1 16 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire3/concat 2 1 fire3/relu_expand1x1 fire3/relu_expand3x3 fire3/concat 1 ," +"QuantizedConvolution fire4/squeeze1x1 1 1 fire3/concat fire4/relu_squeeze1x1 1 128 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire4/expand1x1 1 1 fire4/relu_squeeze1x1 fire4/relu_expand1x1 1 32 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire4/expand3x3 1 1 fire4/relu_squeeze1x1 fire4/relu_expand3x3 1 32 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire4/concat 2 1 fire4/relu_expand1x1 fire4/relu_expand3x3 fire4/concat 1 ," +"QuantizedPooling pool4 1 1 fire4/concat pool4 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution fire5/squeeze1x1 1 1 pool4 fire5/relu_squeeze1x1 1 256 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire5/expand1x1 1 1 fire5/relu_squeeze1x1 fire5/relu_expand1x1 1 32 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire5/expand3x3 1 1 fire5/relu_squeeze1x1 fire5/relu_expand3x3 1 32 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire5/concat 2 1 fire5/relu_expand1x1 fire5/relu_expand3x3 fire5/concat 1 ," +"QuantizedConvolution fire6/squeeze1x1 1 1 fire5/concat fire6/relu_squeeze1x1 1 256 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire6/expand1x1 1 1 fire6/relu_squeeze1x1 fire6/relu_expand1x1 1 48 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire6/expand3x3 1 1 fire6/relu_squeeze1x1 fire6/relu_expand3x3 1 48 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire6/concat 2 1 fire6/relu_expand1x1 fire6/relu_expand3x3 fire6/concat 1 ," +"QuantizedConvolution fire7/squeeze1x1 1 1 fire6/concat fire7/relu_squeeze1x1 1 384 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution 
fire7/expand1x1 1 1 fire7/relu_squeeze1x1 fire7/relu_expand1x1 1 48 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire7/expand3x3 1 1 fire7/relu_squeeze1x1 fire7/relu_expand3x3 1 48 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire7/concat 2 1 fire7/relu_expand1x1 fire7/relu_expand3x3 fire7/concat 1 ," +"QuantizedConvolution fire8/squeeze1x1 1 1 fire7/concat fire8/relu_squeeze1x1 1 384 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire8/expand1x1 1 1 fire8/relu_squeeze1x1 fire8/relu_expand1x1 1 64 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire8/expand3x3 1 1 fire8/relu_squeeze1x1 fire8/relu_expand3x3 1 64 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire8/concat 2 1 fire8/relu_expand1x1 fire8/relu_expand3x3 fire8/concat 1 ," +"QuantizedPooling pool8 1 1 fire8/concat pool8 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"QuantizedConvolution fire9/squeeze1x1 1 1 pool8 fire9/relu_squeeze1x1 1 512 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire9/expand1x1 1 1 fire9/relu_squeeze1x1 fire9/relu_expand1x1 1 64 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution fire9/expand3x3 1 1 fire9/relu_squeeze1x1 fire9/relu_expand3x3 1 64 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat fire9/concat 2 1 fire9/relu_expand1x1 fire9/relu_expand3x3 fire9/concat 1 ," +"QuantizedConvolution conv10 1 1 fire9/concat relu_conv10 1 512 1000 1 1 1 1 1 1 1 -1 1 1 1 ," +"QuantizedPooling pool10 1 1 relu_conv10 pool10 1 15 15 1 1 0 0 -1 -1 -1 0 ," +"SoftmaxCaffe prob 1 1 pool10 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.1.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.1.tnnproto new file mode 100644 index 0000000..f768103 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/quant_squeezenet_v1.1.tnnproto @@ -0,0 +1,44 @@ +"1 66 1 4206624770 ," +"data 1 3 224 224 ," +" data squeezenet0_concat0 squeezenet0_concat1 squeezenet0_concat2 squeezenet0_concat3 squeezenet0_concat4 squeezenet0_concat5 squeezenet0_concat6 
squeezenet0_concat7 squeezenet0_conv0_fwd squeezenet0_conv10_fwd squeezenet0_conv11_fwd squeezenet0_conv12_fwd squeezenet0_conv13_fwd squeezenet0_conv14_fwd squeezenet0_conv15_fwd squeezenet0_conv16_fwd squeezenet0_conv17_fwd squeezenet0_conv18_fwd squeezenet0_conv19_fwd squeezenet0_conv1_fwd squeezenet0_conv20_fwd squeezenet0_conv21_fwd squeezenet0_conv22_fwd squeezenet0_conv23_fwd squeezenet0_conv24_fwd squeezenet0_conv25_fwd squeezenet0_conv2_fwd squeezenet0_conv3_fwd squeezenet0_conv4_fwd squeezenet0_conv5_fwd squeezenet0_conv6_fwd squeezenet0_conv7_fwd squeezenet0_conv8_fwd squeezenet0_conv9_fwd squeezenet0_flatten0_reshape0 squeezenet0_pool0_fwd squeezenet0_pool1_fwd squeezenet0_pool2_fwd squeezenet0_pool3_fwd squeezenet0_relu0_fwd squeezenet0_relu10_fwd squeezenet0_relu11_fwd squeezenet0_relu12_fwd squeezenet0_relu13_fwd squeezenet0_relu14_fwd squeezenet0_relu15_fwd squeezenet0_relu16_fwd squeezenet0_relu17_fwd squeezenet0_relu18_fwd squeezenet0_relu19_fwd squeezenet0_relu1_fwd squeezenet0_relu20_fwd squeezenet0_relu21_fwd squeezenet0_relu22_fwd squeezenet0_relu23_fwd squeezenet0_relu24_fwd squeezenet0_relu25_fwd squeezenet0_relu2_fwd squeezenet0_relu3_fwd squeezenet0_relu4_fwd squeezenet0_relu5_fwd squeezenet0_relu6_fwd squeezenet0_relu7_fwd squeezenet0_relu8_fwd squeezenet0_relu9_fwd ," +"squeezenet0_flatten0_reshape0 ," +" 39 ," +"QuantizedConvolution squeezenet0_conv0_fwd 1 1 data squeezenet0_relu0_fwd 1 3 64 3 3 2 2 0 0 1 -1 1 1 1 ," +"QuantizedPooling squeezenet0_pool0_fwd 1 1 squeezenet0_relu0_fwd squeezenet0_pool0_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"QuantizedConvolution squeezenet0_conv1_fwd 1 1 squeezenet0_pool0_fwd squeezenet0_relu1_fwd 1 64 16 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv2_fwd 1 1 squeezenet0_relu1_fwd squeezenet0_relu2_fwd 1 16 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv3_fwd 1 1 squeezenet0_relu1_fwd squeezenet0_relu3_fwd 1 16 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat 
squeezenet0_concat0 2 1 squeezenet0_relu2_fwd squeezenet0_relu3_fwd squeezenet0_concat0 1 ," +"QuantizedConvolution squeezenet0_conv4_fwd 1 1 squeezenet0_concat0 squeezenet0_relu4_fwd 1 128 16 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv5_fwd 1 1 squeezenet0_relu4_fwd squeezenet0_relu5_fwd 1 16 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv6_fwd 1 1 squeezenet0_relu4_fwd squeezenet0_relu6_fwd 1 16 64 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat1 2 1 squeezenet0_relu5_fwd squeezenet0_relu6_fwd squeezenet0_concat1 1 ," +"QuantizedPooling squeezenet0_pool1_fwd 1 1 squeezenet0_concat1 squeezenet0_pool1_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"QuantizedConvolution squeezenet0_conv7_fwd 1 1 squeezenet0_pool1_fwd squeezenet0_relu7_fwd 1 128 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv8_fwd 1 1 squeezenet0_relu7_fwd squeezenet0_relu8_fwd 1 32 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv9_fwd 1 1 squeezenet0_relu7_fwd squeezenet0_relu9_fwd 1 32 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat2 2 1 squeezenet0_relu8_fwd squeezenet0_relu9_fwd squeezenet0_concat2 1 ," +"QuantizedConvolution squeezenet0_conv10_fwd 1 1 squeezenet0_concat2 squeezenet0_relu10_fwd 1 256 32 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv11_fwd 1 1 squeezenet0_relu10_fwd squeezenet0_relu11_fwd 1 32 128 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv12_fwd 1 1 squeezenet0_relu10_fwd squeezenet0_relu12_fwd 1 32 128 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat3 2 1 squeezenet0_relu11_fwd squeezenet0_relu12_fwd squeezenet0_concat3 1 ," +"QuantizedPooling squeezenet0_pool2_fwd 1 1 squeezenet0_concat3 squeezenet0_pool2_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"QuantizedConvolution squeezenet0_conv13_fwd 1 1 squeezenet0_pool2_fwd squeezenet0_relu13_fwd 1 256 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv14_fwd 1 1 
squeezenet0_relu13_fwd squeezenet0_relu14_fwd 1 48 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv15_fwd 1 1 squeezenet0_relu13_fwd squeezenet0_relu15_fwd 1 48 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat4 2 1 squeezenet0_relu14_fwd squeezenet0_relu15_fwd squeezenet0_concat4 1 ," +"QuantizedConvolution squeezenet0_conv16_fwd 1 1 squeezenet0_concat4 squeezenet0_relu16_fwd 1 384 48 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv17_fwd 1 1 squeezenet0_relu16_fwd squeezenet0_relu17_fwd 1 48 192 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv18_fwd 1 1 squeezenet0_relu16_fwd squeezenet0_relu18_fwd 1 48 192 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat5 2 1 squeezenet0_relu17_fwd squeezenet0_relu18_fwd squeezenet0_concat5 1 ," +"QuantizedConvolution squeezenet0_conv19_fwd 1 1 squeezenet0_concat5 squeezenet0_relu19_fwd 1 384 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv20_fwd 1 1 squeezenet0_relu19_fwd squeezenet0_relu20_fwd 1 64 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv21_fwd 1 1 squeezenet0_relu19_fwd squeezenet0_relu21_fwd 1 64 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat6 2 1 squeezenet0_relu20_fwd squeezenet0_relu21_fwd squeezenet0_concat6 1 ," +"QuantizedConvolution squeezenet0_conv22_fwd 1 1 squeezenet0_concat6 squeezenet0_relu22_fwd 1 512 64 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv23_fwd 1 1 squeezenet0_relu22_fwd squeezenet0_relu23_fwd 1 64 256 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedConvolution squeezenet0_conv24_fwd 1 1 squeezenet0_relu22_fwd squeezenet0_relu24_fwd 1 64 256 3 3 1 1 1 1 1 -1 1 1 1 ," +"QuantizedConcat squeezenet0_concat7 2 1 squeezenet0_relu23_fwd squeezenet0_relu24_fwd squeezenet0_concat7 1 ," +"QuantizedConvolution squeezenet0_conv25_fwd 1 1 squeezenet0_concat7 squeezenet0_relu25_fwd 1 512 1000 1 1 1 1 0 0 1 -1 1 1 1 ," +"QuantizedPooling 
squeezenet0_pool3_fwd 1 1 squeezenet0_relu25_fwd squeezenet0_pool3_fwd 1 13 13 13 13 0 0 -1 -1 -1 0 ," +"Reshape squeezenet0_flatten0_reshape0 1 1 squeezenet0_pool3_fwd squeezenet0_flatten0_reshape0 0 4 4 0 0 -1 1 0 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/resnet50.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/resnet50.tnnproto new file mode 100644 index 0000000..87d46dc --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/resnet50.tnnproto @@ -0,0 +1,128 @@ +"1 124 1 4206624770 ," +"input 1 3 224 224 ," +" conv1_Y conv1_relu_Y fc1000_Gemm_Y fc1000_Reshape_Y input pool1_Y pool5_Y prob_Y res2a_Y res2a_branch1_Y res2a_branch2a_Y res2a_branch2a_relu_Y res2a_branch2b_Y res2a_branch2b_relu_Y res2a_branch2c_Y res2a_relu_Y res2b_Y res2b_branch2a_Y res2b_branch2a_relu_Y res2b_branch2b_Y res2b_branch2b_relu_Y res2b_branch2c_Y res2b_relu_Y res2c_Y res2c_branch2a_Y res2c_branch2a_relu_Y res2c_branch2b_Y res2c_branch2b_relu_Y res2c_branch2c_Y res2c_relu_Y res3a_Y res3a_branch1_Y res3a_branch2a_Y res3a_branch2a_relu_Y res3a_branch2b_Y res3a_branch2b_relu_Y res3a_branch2c_Y res3a_relu_Y res3b_Y res3b_branch2a_Y res3b_branch2a_relu_Y res3b_branch2b_Y res3b_branch2b_relu_Y res3b_branch2c_Y res3b_relu_Y res3c_Y res3c_branch2a_Y res3c_branch2a_relu_Y res3c_branch2b_Y res3c_branch2b_relu_Y res3c_branch2c_Y res3c_relu_Y res3d_Y res3d_branch2a_Y res3d_branch2a_relu_Y res3d_branch2b_Y res3d_branch2b_relu_Y res3d_branch2c_Y res3d_relu_Y res4a_Y res4a_branch1_Y res4a_branch2a_Y res4a_branch2a_relu_Y res4a_branch2b_Y res4a_branch2b_relu_Y res4a_branch2c_Y res4a_relu_Y res4b_Y res4b_branch2a_Y res4b_branch2a_relu_Y res4b_branch2b_Y res4b_branch2b_relu_Y res4b_branch2c_Y res4b_relu_Y res4c_Y res4c_branch2a_Y res4c_branch2a_relu_Y res4c_branch2b_Y res4c_branch2b_relu_Y res4c_branch2c_Y res4c_relu_Y res4d_Y res4d_branch2a_Y res4d_branch2a_relu_Y res4d_branch2b_Y res4d_branch2b_relu_Y res4d_branch2c_Y res4d_relu_Y res4e_Y res4e_branch2a_Y res4e_branch2a_relu_Y 
res4e_branch2b_Y res4e_branch2b_relu_Y res4e_branch2c_Y res4e_relu_Y res4f_Y res4f_branch2a_Y res4f_branch2a_relu_Y res4f_branch2b_Y res4f_branch2b_relu_Y res4f_branch2c_Y res4f_relu_Y res5a_Y res5a_branch1_Y res5a_branch2a_Y res5a_branch2a_relu_Y res5a_branch2b_Y res5a_branch2b_relu_Y res5a_branch2c_Y res5a_relu_Y res5b_Y res5b_branch2a_Y res5b_branch2a_relu_Y res5b_branch2b_Y res5b_branch2b_relu_Y res5b_branch2c_Y res5b_relu_Y res5c_Y res5c_branch2a_Y res5c_branch2a_relu_Y res5c_branch2b_Y res5c_branch2b_relu_Y res5c_branch2c_Y res5c_relu_Y ," +"prob_Y ," +" 123 ," +"Convolution conv1 1 1 input conv1_Y 1 3 64 7 7 2 2 3 3 1 -1 1 1 ," +"ReLU conv1_relu 1 1 conv1_Y conv1_relu_Y ," +"Pooling pool1 1 1 conv1_relu_Y pool1_Y 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"Convolution res2a_branch1 1 1 pool1_Y res2a_branch1_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"Convolution res2a_branch2a 1 1 pool1_Y res2a_branch2a_Y 1 64 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res2a_branch2a_relu 1 1 res2a_branch2a_Y res2a_branch2a_relu_Y ," +"Convolution res2a_branch2b 1 1 res2a_branch2a_relu_Y res2a_branch2b_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res2a_branch2b_relu 1 1 res2a_branch2b_Y res2a_branch2b_relu_Y ," +"Convolution res2a_branch2c 1 1 res2a_branch2b_relu_Y res2a_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"Add res2a 2 1 res2a_branch1_Y res2a_branch2c_Y res2a_Y ," +"ReLU res2a_relu 1 1 res2a_Y res2a_relu_Y ," +"Convolution res2b_branch2a 1 1 res2a_relu_Y res2b_branch2a_Y 1 256 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res2b_branch2a_relu 1 1 res2b_branch2a_Y res2b_branch2a_relu_Y ," +"Convolution res2b_branch2b 1 1 res2b_branch2a_relu_Y res2b_branch2b_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res2b_branch2b_relu 1 1 res2b_branch2b_Y res2b_branch2b_relu_Y ," +"Convolution res2b_branch2c 1 1 res2b_branch2b_relu_Y res2b_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"Add res2b 2 1 res2a_relu_Y res2b_branch2c_Y res2b_Y ," +"ReLU res2b_relu 1 1 res2b_Y res2b_relu_Y ," +"Convolution res2c_branch2a 1 1 res2b_relu_Y 
res2c_branch2a_Y 1 256 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res2c_branch2a_relu 1 1 res2c_branch2a_Y res2c_branch2a_relu_Y ," +"Convolution res2c_branch2b 1 1 res2c_branch2a_relu_Y res2c_branch2b_Y 1 64 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res2c_branch2b_relu 1 1 res2c_branch2b_Y res2c_branch2b_relu_Y ," +"Convolution res2c_branch2c 1 1 res2c_branch2b_relu_Y res2c_branch2c_Y 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"Add res2c 2 1 res2b_relu_Y res2c_branch2c_Y res2c_Y ," +"ReLU res2c_relu 1 1 res2c_Y res2c_relu_Y ," +"Convolution res3a_branch1 1 1 res2c_relu_Y res3a_branch1_Y 1 256 512 1 1 2 2 0 0 1 -1 1 1 ," +"Convolution res3a_branch2a 1 1 res2c_relu_Y res3a_branch2a_Y 1 256 128 1 1 2 2 0 0 1 -1 1 1 ," +"ReLU res3a_branch2a_relu 1 1 res3a_branch2a_Y res3a_branch2a_relu_Y ," +"Convolution res3a_branch2b 1 1 res3a_branch2a_relu_Y res3a_branch2b_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res3a_branch2b_relu 1 1 res3a_branch2b_Y res3a_branch2b_relu_Y ," +"Convolution res3a_branch2c 1 1 res3a_branch2b_relu_Y res3a_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 ," +"Add res3a 2 1 res3a_branch1_Y res3a_branch2c_Y res3a_Y ," +"ReLU res3a_relu 1 1 res3a_Y res3a_relu_Y ," +"Convolution res3b_branch2a 1 1 res3a_relu_Y res3b_branch2a_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res3b_branch2a_relu 1 1 res3b_branch2a_Y res3b_branch2a_relu_Y ," +"Convolution res3b_branch2b 1 1 res3b_branch2a_relu_Y res3b_branch2b_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res3b_branch2b_relu 1 1 res3b_branch2b_Y res3b_branch2b_relu_Y ," +"Convolution res3b_branch2c 1 1 res3b_branch2b_relu_Y res3b_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 ," +"Add res3b 2 1 res3a_relu_Y res3b_branch2c_Y res3b_Y ," +"ReLU res3b_relu 1 1 res3b_Y res3b_relu_Y ," +"Convolution res3c_branch2a 1 1 res3b_relu_Y res3c_branch2a_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res3c_branch2a_relu 1 1 res3c_branch2a_Y res3c_branch2a_relu_Y ," +"Convolution res3c_branch2b 1 1 res3c_branch2a_relu_Y res3c_branch2b_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 ," 
+"ReLU res3c_branch2b_relu 1 1 res3c_branch2b_Y res3c_branch2b_relu_Y ," +"Convolution res3c_branch2c 1 1 res3c_branch2b_relu_Y res3c_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 ," +"Add res3c 2 1 res3b_relu_Y res3c_branch2c_Y res3c_Y ," +"ReLU res3c_relu 1 1 res3c_Y res3c_relu_Y ," +"Convolution res3d_branch2a 1 1 res3c_relu_Y res3d_branch2a_Y 1 512 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res3d_branch2a_relu 1 1 res3d_branch2a_Y res3d_branch2a_relu_Y ," +"Convolution res3d_branch2b 1 1 res3d_branch2a_relu_Y res3d_branch2b_Y 1 128 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res3d_branch2b_relu 1 1 res3d_branch2b_Y res3d_branch2b_relu_Y ," +"Convolution res3d_branch2c 1 1 res3d_branch2b_relu_Y res3d_branch2c_Y 1 128 512 1 1 1 1 0 0 1 -1 1 1 ," +"Add res3d 2 1 res3c_relu_Y res3d_branch2c_Y res3d_Y ," +"ReLU res3d_relu 1 1 res3d_Y res3d_relu_Y ," +"Convolution res4a_branch1 1 1 res3d_relu_Y res4a_branch1_Y 1 512 1024 1 1 2 2 0 0 1 -1 1 1 ," +"Convolution res4a_branch2a 1 1 res3d_relu_Y res4a_branch2a_Y 1 512 256 1 1 2 2 0 0 1 -1 1 1 ," +"ReLU res4a_branch2a_relu 1 1 res4a_branch2a_Y res4a_branch2a_relu_Y ," +"Convolution res4a_branch2b 1 1 res4a_branch2a_relu_Y res4a_branch2b_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4a_branch2b_relu 1 1 res4a_branch2b_Y res4a_branch2b_relu_Y ," +"Convolution res4a_branch2c 1 1 res4a_branch2b_relu_Y res4a_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4a 2 1 res4a_branch1_Y res4a_branch2c_Y res4a_Y ," +"ReLU res4a_relu 1 1 res4a_Y res4a_relu_Y ," +"Convolution res4b_branch2a 1 1 res4a_relu_Y res4b_branch2a_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res4b_branch2a_relu 1 1 res4b_branch2a_Y res4b_branch2a_relu_Y ," +"Convolution res4b_branch2b 1 1 res4b_branch2a_relu_Y res4b_branch2b_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4b_branch2b_relu 1 1 res4b_branch2b_Y res4b_branch2b_relu_Y ," +"Convolution res4b_branch2c 1 1 res4b_branch2b_relu_Y res4b_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4b 2 1 res4a_relu_Y 
res4b_branch2c_Y res4b_Y ," +"ReLU res4b_relu 1 1 res4b_Y res4b_relu_Y ," +"Convolution res4c_branch2a 1 1 res4b_relu_Y res4c_branch2a_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res4c_branch2a_relu 1 1 res4c_branch2a_Y res4c_branch2a_relu_Y ," +"Convolution res4c_branch2b 1 1 res4c_branch2a_relu_Y res4c_branch2b_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4c_branch2b_relu 1 1 res4c_branch2b_Y res4c_branch2b_relu_Y ," +"Convolution res4c_branch2c 1 1 res4c_branch2b_relu_Y res4c_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4c 2 1 res4b_relu_Y res4c_branch2c_Y res4c_Y ," +"ReLU res4c_relu 1 1 res4c_Y res4c_relu_Y ," +"Convolution res4d_branch2a 1 1 res4c_relu_Y res4d_branch2a_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res4d_branch2a_relu 1 1 res4d_branch2a_Y res4d_branch2a_relu_Y ," +"Convolution res4d_branch2b 1 1 res4d_branch2a_relu_Y res4d_branch2b_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4d_branch2b_relu 1 1 res4d_branch2b_Y res4d_branch2b_relu_Y ," +"Convolution res4d_branch2c 1 1 res4d_branch2b_relu_Y res4d_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4d 2 1 res4c_relu_Y res4d_branch2c_Y res4d_Y ," +"ReLU res4d_relu 1 1 res4d_Y res4d_relu_Y ," +"Convolution res4e_branch2a 1 1 res4d_relu_Y res4e_branch2a_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res4e_branch2a_relu 1 1 res4e_branch2a_Y res4e_branch2a_relu_Y ," +"Convolution res4e_branch2b 1 1 res4e_branch2a_relu_Y res4e_branch2b_Y 1 256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4e_branch2b_relu 1 1 res4e_branch2b_Y res4e_branch2b_relu_Y ," +"Convolution res4e_branch2c 1 1 res4e_branch2b_relu_Y res4e_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4e 2 1 res4d_relu_Y res4e_branch2c_Y res4e_Y ," +"ReLU res4e_relu 1 1 res4e_Y res4e_relu_Y ," +"Convolution res4f_branch2a 1 1 res4e_relu_Y res4f_branch2a_Y 1 1024 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res4f_branch2a_relu 1 1 res4f_branch2a_Y res4f_branch2a_relu_Y ," +"Convolution res4f_branch2b 1 1 res4f_branch2a_relu_Y res4f_branch2b_Y 1 
256 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res4f_branch2b_relu 1 1 res4f_branch2b_Y res4f_branch2b_relu_Y ," +"Convolution res4f_branch2c 1 1 res4f_branch2b_relu_Y res4f_branch2c_Y 1 256 1024 1 1 1 1 0 0 1 -1 1 1 ," +"Add res4f 2 1 res4e_relu_Y res4f_branch2c_Y res4f_Y ," +"ReLU res4f_relu 1 1 res4f_Y res4f_relu_Y ," +"Convolution res5a_branch1 1 1 res4f_relu_Y res5a_branch1_Y 1 1024 2048 1 1 2 2 0 0 1 -1 1 1 ," +"Convolution res5a_branch2a 1 1 res4f_relu_Y res5a_branch2a_Y 1 1024 512 1 1 2 2 0 0 1 -1 1 1 ," +"ReLU res5a_branch2a_relu 1 1 res5a_branch2a_Y res5a_branch2a_relu_Y ," +"Convolution res5a_branch2b 1 1 res5a_branch2a_relu_Y res5a_branch2b_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res5a_branch2b_relu 1 1 res5a_branch2b_Y res5a_branch2b_relu_Y ," +"Convolution res5a_branch2c 1 1 res5a_branch2b_relu_Y res5a_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 ," +"Add res5a 2 1 res5a_branch1_Y res5a_branch2c_Y res5a_Y ," +"ReLU res5a_relu 1 1 res5a_Y res5a_relu_Y ," +"Convolution res5b_branch2a 1 1 res5a_relu_Y res5b_branch2a_Y 1 2048 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res5b_branch2a_relu 1 1 res5b_branch2a_Y res5b_branch2a_relu_Y ," +"Convolution res5b_branch2b 1 1 res5b_branch2a_relu_Y res5b_branch2b_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res5b_branch2b_relu 1 1 res5b_branch2b_Y res5b_branch2b_relu_Y ," +"Convolution res5b_branch2c 1 1 res5b_branch2b_relu_Y res5b_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 ," +"Add res5b 2 1 res5a_relu_Y res5b_branch2c_Y res5b_Y ," +"ReLU res5b_relu 1 1 res5b_Y res5b_relu_Y ," +"Convolution res5c_branch2a 1 1 res5b_relu_Y res5c_branch2a_Y 1 2048 512 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU res5c_branch2a_relu 1 1 res5c_branch2a_Y res5c_branch2a_relu_Y ," +"Convolution res5c_branch2b 1 1 res5c_branch2a_relu_Y res5c_branch2b_Y 1 512 512 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU res5c_branch2b_relu 1 1 res5c_branch2b_Y res5c_branch2b_relu_Y ," +"Convolution res5c_branch2c 1 1 res5c_branch2b_relu_Y res5c_branch2c_Y 1 512 2048 1 1 1 1 0 0 1 -1 1 1 ," +"Add 
res5c 2 1 res5b_relu_Y res5c_branch2c_Y res5c_Y ," +"ReLU res5c_relu 1 1 res5c_Y res5c_relu_Y ," +"Pooling pool5 1 1 res5c_relu_Y pool5_Y 1 7 7 1 1 0 0 -1 -1 -1 0 ," +"Reshape fc1000_Reshape 1 1 pool5_Y fc1000_Reshape_Y 0 4 4 0 2048 1 1 ," +"InnerProduct fc1000_Gemm 1 1 fc1000_Reshape_Y fc1000_Gemm_Y 1000 1 0 1 ," +"SoftmaxCaffe prob 1 1 fc1000_Gemm_Y prob_Y 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/shufflenet_v2.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/shufflenet_v2.tnnproto new file mode 100644 index 0000000..018e4e9 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/shufflenet_v2.tnnproto @@ -0,0 +1,160 @@ +"1 156 1 4206624770 ," +"input_input 1 3 224 224 ," +" conv5 conv5/relu data fc input_input pool stage1/conv stage1/conv/relu stage1/pool stage_2_1/concat stage_2_1/conv1 stage_2_1/conv1/relu stage_2_1/conv2 stage_2_1/conv3 stage_2_1/conv3/relu stage_2_1/conv4 stage_2_1/conv5 stage_2_1/conv5/relu stage_2_1/shuffle stage_2_2/concat stage_2_2/conv1 stage_2_2/conv1/relu stage_2_2/conv2 stage_2_2/conv3 stage_2_2/conv3/relu stage_2_2/shuffle stage_2_2/slice1 stage_2_2/slice2 stage_2_3/concat stage_2_3/conv1 stage_2_3/conv1/relu stage_2_3/conv2 stage_2_3/conv3 stage_2_3/conv3/relu stage_2_3/shuffle stage_2_3/slice1 stage_2_3/slice2 stage_2_4/concat stage_2_4/conv1 stage_2_4/conv1/relu stage_2_4/conv2 stage_2_4/conv3 stage_2_4/conv3/relu stage_2_4/shuffle stage_2_4/slice1 stage_2_4/slice2 stage_3_1/concat stage_3_1/conv1 stage_3_1/conv1/relu stage_3_1/conv2 stage_3_1/conv3 stage_3_1/conv3/relu stage_3_1/conv4 stage_3_1/conv5 stage_3_1/conv5/relu stage_3_1/shuffle stage_3_2/concat stage_3_2/conv1 stage_3_2/conv1/relu stage_3_2/conv2 stage_3_2/conv3 stage_3_2/conv3/relu stage_3_2/shuffle stage_3_2/slice1 stage_3_2/slice2 stage_3_3/concat stage_3_3/conv1 stage_3_3/conv1/relu stage_3_3/conv2 stage_3_3/conv3 stage_3_3/conv3/relu stage_3_3/shuffle stage_3_3/slice1 stage_3_3/slice2 stage_3_4/concat stage_3_4/conv1 stage_3_4/conv1/relu 
stage_3_4/conv2 stage_3_4/conv3 stage_3_4/conv3/relu stage_3_4/shuffle stage_3_4/slice1 stage_3_4/slice2 stage_3_5/concat stage_3_5/conv1 stage_3_5/conv1/relu stage_3_5/conv2 stage_3_5/conv3 stage_3_5/conv3/relu stage_3_5/shuffle stage_3_5/slice1 stage_3_5/slice2 stage_3_6/concat stage_3_6/conv1 stage_3_6/conv1/relu stage_3_6/conv2 stage_3_6/conv3 stage_3_6/conv3/relu stage_3_6/shuffle stage_3_6/slice1 stage_3_6/slice2 stage_3_7/concat stage_3_7/conv1 stage_3_7/conv1/relu stage_3_7/conv2 stage_3_7/conv3 stage_3_7/conv3/relu stage_3_7/shuffle stage_3_7/slice1 stage_3_7/slice2 stage_3_8/concat stage_3_8/conv1 stage_3_8/conv1/relu stage_3_8/conv2 stage_3_8/conv3 stage_3_8/conv3/relu stage_3_8/shuffle stage_3_8/slice1 stage_3_8/slice2 stage_4_1/concat stage_4_1/conv1 stage_4_1/conv1/relu stage_4_1/conv2 stage_4_1/conv3 stage_4_1/conv3/relu stage_4_1/conv4 stage_4_1/conv5 stage_4_1/conv5/relu stage_4_1/shuffle stage_4_2/concat stage_4_2/conv1 stage_4_2/conv1/relu stage_4_2/conv2 stage_4_2/conv3 stage_4_2/conv3/relu stage_4_2/shuffle stage_4_2/slice1 stage_4_2/slice2 stage_4_3/concat stage_4_3/conv1 stage_4_3/conv1/relu stage_4_3/conv2 stage_4_3/conv3 stage_4_3/conv3/relu stage_4_3/shuffle stage_4_3/slice1 stage_4_3/slice2 stage_4_4/concat stage_4_4/conv1 stage_4_4/conv1/relu stage_4_4/conv2 stage_4_4/conv3 stage_4_4/conv3/relu stage_4_4/shuffle stage_4_4/slice1 stage_4_4/slice2 ," +"fc ," +" 155 ," +"BatchNormCxx data/bn 1 1 input_input data ," +"Convolution stage1/conv 1 1 data stage1/conv 1 3 24 3 3 2 2 1 1 1 -1 1 1 ," +"ReLU stage1/conv/relu 1 1 stage1/conv stage1/conv/relu ," +"Pooling stage1/pool 1 1 stage1/conv/relu stage1/pool 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"Convolution stage_2_1/conv4 1 1 stage1/pool stage_2_1/conv4 24 1 24 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_2_1/conv5 1 1 stage_2_1/conv4 stage_2_1/conv5 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_1/conv5/relu 1 1 stage_2_1/conv5 stage_2_1/conv5/relu ," +"Convolution stage_2_1/conv1 1 1 stage1/pool 
stage_2_1/conv1 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_1/conv1/relu 1 1 stage_2_1/conv1 stage_2_1/conv1/relu ," +"Convolution stage_2_1/conv2 1 1 stage_2_1/conv1/relu stage_2_1/conv2 24 1 24 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_2_1/conv3 1 1 stage_2_1/conv2 stage_2_1/conv3 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_1/conv3/relu 1 1 stage_2_1/conv3 stage_2_1/conv3/relu ," +"Concat stage_2_1/concat 2 1 stage_2_1/conv5/relu stage_2_1/conv3/relu stage_2_1/concat 1 ," +"ShuffleChannel stage_2_1/shuffle 1 1 stage_2_1/concat stage_2_1/shuffle 2 ," +"StridedSlice stage_2_2/slice1 1 1 stage_2_1/shuffle stage_2_2/slice1 4 0 0 0 0 4 0 24 0 0 4 1 1 1 1 ," +"StridedSlice stage_2_2/slice2 1 1 stage_2_1/shuffle stage_2_2/slice2 4 0 24 0 0 4 0 48 0 0 4 1 1 1 1 ," +"Convolution stage_2_2/conv1 1 1 stage_2_2/slice2 stage_2_2/conv1 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_2/conv1/relu 1 1 stage_2_2/conv1 stage_2_2/conv1/relu ," +"Convolution stage_2_2/conv2 1 1 stage_2_2/conv1/relu stage_2_2/conv2 24 1 24 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_2_2/conv3 1 1 stage_2_2/conv2 stage_2_2/conv3 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_2/conv3/relu 1 1 stage_2_2/conv3 stage_2_2/conv3/relu ," +"Concat stage_2_2/concat 2 1 stage_2_2/slice1 stage_2_2/conv3/relu stage_2_2/concat 1 ," +"ShuffleChannel stage_2_2/shuffle 1 1 stage_2_2/concat stage_2_2/shuffle 2 ," +"StridedSlice stage_2_3/slice1 1 1 stage_2_2/shuffle stage_2_3/slice1 4 0 0 0 0 4 0 24 0 0 4 1 1 1 1 ," +"StridedSlice stage_2_3/slice2 1 1 stage_2_2/shuffle stage_2_3/slice2 4 0 24 0 0 4 0 48 0 0 4 1 1 1 1 ," +"Convolution stage_2_3/conv1 1 1 stage_2_3/slice2 stage_2_3/conv1 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_3/conv1/relu 1 1 stage_2_3/conv1 stage_2_3/conv1/relu ," +"Convolution stage_2_3/conv2 1 1 stage_2_3/conv1/relu stage_2_3/conv2 24 1 24 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_2_3/conv3 1 1 stage_2_3/conv2 stage_2_3/conv3 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU 
stage_2_3/conv3/relu 1 1 stage_2_3/conv3 stage_2_3/conv3/relu ," +"Concat stage_2_3/concat 2 1 stage_2_3/slice1 stage_2_3/conv3/relu stage_2_3/concat 1 ," +"ShuffleChannel stage_2_3/shuffle 1 1 stage_2_3/concat stage_2_3/shuffle 2 ," +"StridedSlice stage_2_4/slice1 1 1 stage_2_3/shuffle stage_2_4/slice1 4 0 0 0 0 4 0 24 0 0 4 1 1 1 1 ," +"StridedSlice stage_2_4/slice2 1 1 stage_2_3/shuffle stage_2_4/slice2 4 0 24 0 0 4 0 48 0 0 4 1 1 1 1 ," +"Convolution stage_2_4/conv1 1 1 stage_2_4/slice2 stage_2_4/conv1 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_4/conv1/relu 1 1 stage_2_4/conv1 stage_2_4/conv1/relu ," +"Convolution stage_2_4/conv2 1 1 stage_2_4/conv1/relu stage_2_4/conv2 24 1 24 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_2_4/conv3 1 1 stage_2_4/conv2 stage_2_4/conv3 1 24 24 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_2_4/conv3/relu 1 1 stage_2_4/conv3 stage_2_4/conv3/relu ," +"Concat stage_2_4/concat 2 1 stage_2_4/slice1 stage_2_4/conv3/relu stage_2_4/concat 1 ," +"ShuffleChannel stage_2_4/shuffle 1 1 stage_2_4/concat stage_2_4/shuffle 2 ," +"Convolution stage_3_1/conv4 1 1 stage_2_4/shuffle stage_3_1/conv4 48 1 48 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_3_1/conv5 1 1 stage_3_1/conv4 stage_3_1/conv5 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_1/conv5/relu 1 1 stage_3_1/conv5 stage_3_1/conv5/relu ," +"Convolution stage_3_1/conv1 1 1 stage_2_4/shuffle stage_3_1/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_1/conv1/relu 1 1 stage_3_1/conv1 stage_3_1/conv1/relu ," +"Convolution stage_3_1/conv2 1 1 stage_3_1/conv1/relu stage_3_1/conv2 48 1 48 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_3_1/conv3 1 1 stage_3_1/conv2 stage_3_1/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_1/conv3/relu 1 1 stage_3_1/conv3 stage_3_1/conv3/relu ," +"Concat stage_3_1/concat 2 1 stage_3_1/conv5/relu stage_3_1/conv3/relu stage_3_1/concat 1 ," +"ShuffleChannel stage_3_1/shuffle 1 1 stage_3_1/concat stage_3_1/shuffle 2 ," +"StridedSlice stage_3_2/slice1 1 1 stage_3_1/shuffle 
stage_3_2/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_2/slice2 1 1 stage_3_1/shuffle stage_3_2/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_2/conv1 1 1 stage_3_2/slice2 stage_3_2/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_2/conv1/relu 1 1 stage_3_2/conv1 stage_3_2/conv1/relu ," +"Convolution stage_3_2/conv2 1 1 stage_3_2/conv1/relu stage_3_2/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_2/conv3 1 1 stage_3_2/conv2 stage_3_2/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_2/conv3/relu 1 1 stage_3_2/conv3 stage_3_2/conv3/relu ," +"Concat stage_3_2/concat 2 1 stage_3_2/slice1 stage_3_2/conv3/relu stage_3_2/concat 1 ," +"ShuffleChannel stage_3_2/shuffle 1 1 stage_3_2/concat stage_3_2/shuffle 2 ," +"StridedSlice stage_3_3/slice1 1 1 stage_3_2/shuffle stage_3_3/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_3/slice2 1 1 stage_3_2/shuffle stage_3_3/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_3/conv1 1 1 stage_3_3/slice2 stage_3_3/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_3/conv1/relu 1 1 stage_3_3/conv1 stage_3_3/conv1/relu ," +"Convolution stage_3_3/conv2 1 1 stage_3_3/conv1/relu stage_3_3/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_3/conv3 1 1 stage_3_3/conv2 stage_3_3/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_3/conv3/relu 1 1 stage_3_3/conv3 stage_3_3/conv3/relu ," +"Concat stage_3_3/concat 2 1 stage_3_3/slice1 stage_3_3/conv3/relu stage_3_3/concat 1 ," +"ShuffleChannel stage_3_3/shuffle 1 1 stage_3_3/concat stage_3_3/shuffle 2 ," +"StridedSlice stage_3_4/slice1 1 1 stage_3_3/shuffle stage_3_4/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_4/slice2 1 1 stage_3_3/shuffle stage_3_4/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_4/conv1 1 1 stage_3_4/slice2 stage_3_4/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_4/conv1/relu 1 1 stage_3_4/conv1 stage_3_4/conv1/relu ," +"Convolution 
stage_3_4/conv2 1 1 stage_3_4/conv1/relu stage_3_4/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_4/conv3 1 1 stage_3_4/conv2 stage_3_4/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_4/conv3/relu 1 1 stage_3_4/conv3 stage_3_4/conv3/relu ," +"Concat stage_3_4/concat 2 1 stage_3_4/slice1 stage_3_4/conv3/relu stage_3_4/concat 1 ," +"ShuffleChannel stage_3_4/shuffle 1 1 stage_3_4/concat stage_3_4/shuffle 2 ," +"StridedSlice stage_3_5/slice1 1 1 stage_3_4/shuffle stage_3_5/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_5/slice2 1 1 stage_3_4/shuffle stage_3_5/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_5/conv1 1 1 stage_3_5/slice2 stage_3_5/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_5/conv1/relu 1 1 stage_3_5/conv1 stage_3_5/conv1/relu ," +"Convolution stage_3_5/conv2 1 1 stage_3_5/conv1/relu stage_3_5/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_5/conv3 1 1 stage_3_5/conv2 stage_3_5/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_5/conv3/relu 1 1 stage_3_5/conv3 stage_3_5/conv3/relu ," +"Concat stage_3_5/concat 2 1 stage_3_5/slice1 stage_3_5/conv3/relu stage_3_5/concat 1 ," +"ShuffleChannel stage_3_5/shuffle 1 1 stage_3_5/concat stage_3_5/shuffle 2 ," +"StridedSlice stage_3_6/slice1 1 1 stage_3_5/shuffle stage_3_6/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_6/slice2 1 1 stage_3_5/shuffle stage_3_6/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_6/conv1 1 1 stage_3_6/slice2 stage_3_6/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_6/conv1/relu 1 1 stage_3_6/conv1 stage_3_6/conv1/relu ," +"Convolution stage_3_6/conv2 1 1 stage_3_6/conv1/relu stage_3_6/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_6/conv3 1 1 stage_3_6/conv2 stage_3_6/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_6/conv3/relu 1 1 stage_3_6/conv3 stage_3_6/conv3/relu ," +"Concat stage_3_6/concat 2 1 stage_3_6/slice1 stage_3_6/conv3/relu stage_3_6/concat 1 ," 
+"ShuffleChannel stage_3_6/shuffle 1 1 stage_3_6/concat stage_3_6/shuffle 2 ," +"StridedSlice stage_3_7/slice1 1 1 stage_3_6/shuffle stage_3_7/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_7/slice2 1 1 stage_3_6/shuffle stage_3_7/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_7/conv1 1 1 stage_3_7/slice2 stage_3_7/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_7/conv1/relu 1 1 stage_3_7/conv1 stage_3_7/conv1/relu ," +"Convolution stage_3_7/conv2 1 1 stage_3_7/conv1/relu stage_3_7/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_7/conv3 1 1 stage_3_7/conv2 stage_3_7/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_7/conv3/relu 1 1 stage_3_7/conv3 stage_3_7/conv3/relu ," +"Concat stage_3_7/concat 2 1 stage_3_7/slice1 stage_3_7/conv3/relu stage_3_7/concat 1 ," +"ShuffleChannel stage_3_7/shuffle 1 1 stage_3_7/concat stage_3_7/shuffle 2 ," +"StridedSlice stage_3_8/slice1 1 1 stage_3_7/shuffle stage_3_8/slice1 4 0 0 0 0 4 0 48 0 0 4 1 1 1 1 ," +"StridedSlice stage_3_8/slice2 1 1 stage_3_7/shuffle stage_3_8/slice2 4 0 48 0 0 4 0 96 0 0 4 1 1 1 1 ," +"Convolution stage_3_8/conv1 1 1 stage_3_8/slice2 stage_3_8/conv1 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_8/conv1/relu 1 1 stage_3_8/conv1 stage_3_8/conv1/relu ," +"Convolution stage_3_8/conv2 1 1 stage_3_8/conv1/relu stage_3_8/conv2 48 1 48 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_3_8/conv3 1 1 stage_3_8/conv2 stage_3_8/conv3 1 48 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_3_8/conv3/relu 1 1 stage_3_8/conv3 stage_3_8/conv3/relu ," +"Concat stage_3_8/concat 2 1 stage_3_8/slice1 stage_3_8/conv3/relu stage_3_8/concat 1 ," +"ShuffleChannel stage_3_8/shuffle 1 1 stage_3_8/concat stage_3_8/shuffle 2 ," +"Convolution stage_4_1/conv4 1 1 stage_3_8/shuffle stage_4_1/conv4 96 1 96 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_4_1/conv5 1 1 stage_4_1/conv4 stage_4_1/conv5 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_1/conv5/relu 1 1 stage_4_1/conv5 stage_4_1/conv5/relu ," 
+"Convolution stage_4_1/conv1 1 1 stage_3_8/shuffle stage_4_1/conv1 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_1/conv1/relu 1 1 stage_4_1/conv1 stage_4_1/conv1/relu ," +"Convolution stage_4_1/conv2 1 1 stage_4_1/conv1/relu stage_4_1/conv2 96 1 96 3 3 2 2 1 1 1 -1 1 1 ," +"Convolution stage_4_1/conv3 1 1 stage_4_1/conv2 stage_4_1/conv3 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_1/conv3/relu 1 1 stage_4_1/conv3 stage_4_1/conv3/relu ," +"Concat stage_4_1/concat 2 1 stage_4_1/conv5/relu stage_4_1/conv3/relu stage_4_1/concat 1 ," +"ShuffleChannel stage_4_1/shuffle 1 1 stage_4_1/concat stage_4_1/shuffle 2 ," +"StridedSlice stage_4_2/slice1 1 1 stage_4_1/shuffle stage_4_2/slice1 4 0 0 0 0 4 0 96 0 0 4 1 1 1 1 ," +"StridedSlice stage_4_2/slice2 1 1 stage_4_1/shuffle stage_4_2/slice2 4 0 96 0 0 4 0 192 0 0 4 1 1 1 1 ," +"Convolution stage_4_2/conv1 1 1 stage_4_2/slice2 stage_4_2/conv1 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_2/conv1/relu 1 1 stage_4_2/conv1 stage_4_2/conv1/relu ," +"Convolution stage_4_2/conv2 1 1 stage_4_2/conv1/relu stage_4_2/conv2 96 1 96 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_4_2/conv3 1 1 stage_4_2/conv2 stage_4_2/conv3 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_2/conv3/relu 1 1 stage_4_2/conv3 stage_4_2/conv3/relu ," +"Concat stage_4_2/concat 2 1 stage_4_2/slice1 stage_4_2/conv3/relu stage_4_2/concat 1 ," +"ShuffleChannel stage_4_2/shuffle 1 1 stage_4_2/concat stage_4_2/shuffle 2 ," +"StridedSlice stage_4_3/slice1 1 1 stage_4_2/shuffle stage_4_3/slice1 4 0 0 0 0 4 0 96 0 0 4 1 1 1 1 ," +"StridedSlice stage_4_3/slice2 1 1 stage_4_2/shuffle stage_4_3/slice2 4 0 96 0 0 4 0 192 0 0 4 1 1 1 1 ," +"Convolution stage_4_3/conv1 1 1 stage_4_3/slice2 stage_4_3/conv1 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_3/conv1/relu 1 1 stage_4_3/conv1 stage_4_3/conv1/relu ," +"Convolution stage_4_3/conv2 1 1 stage_4_3/conv1/relu stage_4_3/conv2 96 1 96 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_4_3/conv3 1 1 stage_4_3/conv2 stage_4_3/conv3 1 96 
96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_3/conv3/relu 1 1 stage_4_3/conv3 stage_4_3/conv3/relu ," +"Concat stage_4_3/concat 2 1 stage_4_3/slice1 stage_4_3/conv3/relu stage_4_3/concat 1 ," +"ShuffleChannel stage_4_3/shuffle 1 1 stage_4_3/concat stage_4_3/shuffle 2 ," +"StridedSlice stage_4_4/slice1 1 1 stage_4_3/shuffle stage_4_4/slice1 4 0 0 0 0 4 0 96 0 0 4 1 1 1 1 ," +"StridedSlice stage_4_4/slice2 1 1 stage_4_3/shuffle stage_4_4/slice2 4 0 96 0 0 4 0 192 0 0 4 1 1 1 1 ," +"Convolution stage_4_4/conv1 1 1 stage_4_4/slice2 stage_4_4/conv1 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_4/conv1/relu 1 1 stage_4_4/conv1 stage_4_4/conv1/relu ," +"Convolution stage_4_4/conv2 1 1 stage_4_4/conv1/relu stage_4_4/conv2 96 1 96 3 3 1 1 1 1 1 -1 1 1 ," +"Convolution stage_4_4/conv3 1 1 stage_4_4/conv2 stage_4_4/conv3 1 96 96 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU stage_4_4/conv3/relu 1 1 stage_4_4/conv3 stage_4_4/conv3/relu ," +"Concat stage_4_4/concat 2 1 stage_4_4/slice1 stage_4_4/conv3/relu stage_4_4/concat 1 ," +"ShuffleChannel stage_4_4/shuffle 1 1 stage_4_4/concat stage_4_4/shuffle 2 ," +"Convolution conv5 1 1 stage_4_4/shuffle conv5 1 192 1024 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU conv5/relu 1 1 conv5 conv5/relu ," +"Pooling pool 1 1 conv5/relu pool 1 7 7 1 1 0 0 -1 -1 -1 1 ," +"Convolution fc 1 1 pool fc 1 1024 1000 1 1 1 1 0 0 1 -1 1 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.0.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.0.tnnproto new file mode 100644 index 0000000..706f149 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.0.tnnproto @@ -0,0 +1,70 @@ +"1 66 1 4206624770 ," +"input 1 3 227 227 ," +" conv1 conv10 fire2/concat fire2/expand1x1 fire2/expand3x3 fire2/relu_expand1x1 fire2/relu_expand3x3 fire2/relu_squeeze1x1 fire2/squeeze1x1 fire3/concat fire3/expand1x1 fire3/expand3x3 fire3/relu_expand1x1 fire3/relu_expand3x3 fire3/relu_squeeze1x1 fire3/squeeze1x1 fire4/concat fire4/expand1x1 fire4/expand3x3 
fire4/relu_expand1x1 fire4/relu_expand3x3 fire4/relu_squeeze1x1 fire4/squeeze1x1 fire5/concat fire5/expand1x1 fire5/expand3x3 fire5/relu_expand1x1 fire5/relu_expand3x3 fire5/relu_squeeze1x1 fire5/squeeze1x1 fire6/concat fire6/expand1x1 fire6/expand3x3 fire6/relu_expand1x1 fire6/relu_expand3x3 fire6/relu_squeeze1x1 fire6/squeeze1x1 fire7/concat fire7/expand1x1 fire7/expand3x3 fire7/relu_expand1x1 fire7/relu_expand3x3 fire7/relu_squeeze1x1 fire7/squeeze1x1 fire8/concat fire8/expand1x1 fire8/expand3x3 fire8/relu_expand1x1 fire8/relu_expand3x3 fire8/relu_squeeze1x1 fire8/squeeze1x1 fire9/concat fire9/expand1x1 fire9/expand3x3 fire9/relu_expand1x1 fire9/relu_expand3x3 fire9/relu_squeeze1x1 fire9/squeeze1x1 input pool1 pool10 pool4 pool8 prob relu_conv1 relu_conv10 ," +"prob ," +" 65 ," +"Convolution conv1 1 1 input conv1 1 3 96 7 7 2 2 0 0 1 -1 1 1 ," +"ReLU relu_conv1 1 1 conv1 relu_conv1 ," +"Pooling pool1 1 1 relu_conv1 pool1 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 1 96 16 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/relu_squeeze1x1 ," +"Convolution fire2/expand1x1 1 1 fire2/relu_squeeze1x1 fire2/expand1x1 1 16 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/relu_expand1x1 ," +"Convolution fire2/expand3x3 1 1 fire2/relu_squeeze1x1 fire2/expand3x3 1 16 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/relu_expand3x3 ," +"Concat fire2/concat 2 1 fire2/relu_expand1x1 fire2/relu_expand3x3 fire2/concat 1 ," +"Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 1 128 16 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/relu_squeeze1x1 ," +"Convolution fire3/expand1x1 1 1 fire3/relu_squeeze1x1 fire3/expand1x1 1 16 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/relu_expand1x1 ," +"Convolution fire3/expand3x3 1 1 fire3/relu_squeeze1x1 fire3/expand3x3 1 16 64 3 
3 1 1 1 1 1 -1 1 1 ," +"ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/relu_expand3x3 ," +"Concat fire3/concat 2 1 fire3/relu_expand1x1 fire3/relu_expand3x3 fire3/concat 1 ," +"Convolution fire4/squeeze1x1 1 1 fire3/concat fire4/squeeze1x1 1 128 32 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/relu_squeeze1x1 ," +"Convolution fire4/expand1x1 1 1 fire4/relu_squeeze1x1 fire4/expand1x1 1 32 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/relu_expand1x1 ," +"Convolution fire4/expand3x3 1 1 fire4/relu_squeeze1x1 fire4/expand3x3 1 32 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/relu_expand3x3 ," +"Concat fire4/concat 2 1 fire4/relu_expand1x1 fire4/relu_expand3x3 fire4/concat 1 ," +"Pooling pool4 1 1 fire4/concat pool4 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"Convolution fire5/squeeze1x1 1 1 pool4 fire5/squeeze1x1 1 256 32 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/relu_squeeze1x1 ," +"Convolution fire5/expand1x1 1 1 fire5/relu_squeeze1x1 fire5/expand1x1 1 32 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/relu_expand1x1 ," +"Convolution fire5/expand3x3 1 1 fire5/relu_squeeze1x1 fire5/expand3x3 1 32 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/relu_expand3x3 ," +"Concat fire5/concat 2 1 fire5/relu_expand1x1 fire5/relu_expand3x3 fire5/concat 1 ," +"Convolution fire6/squeeze1x1 1 1 fire5/concat fire6/squeeze1x1 1 256 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/relu_squeeze1x1 ," +"Convolution fire6/expand1x1 1 1 fire6/relu_squeeze1x1 fire6/expand1x1 1 48 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/relu_expand1x1 ," +"Convolution fire6/expand3x3 1 1 fire6/relu_squeeze1x1 fire6/expand3x3 1 48 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/relu_expand3x3 ," 
+"Concat fire6/concat 2 1 fire6/relu_expand1x1 fire6/relu_expand3x3 fire6/concat 1 ," +"Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 1 384 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/relu_squeeze1x1 ," +"Convolution fire7/expand1x1 1 1 fire7/relu_squeeze1x1 fire7/expand1x1 1 48 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/relu_expand1x1 ," +"Convolution fire7/expand3x3 1 1 fire7/relu_squeeze1x1 fire7/expand3x3 1 48 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/relu_expand3x3 ," +"Concat fire7/concat 2 1 fire7/relu_expand1x1 fire7/relu_expand3x3 fire7/concat 1 ," +"Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 1 384 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/relu_squeeze1x1 ," +"Convolution fire8/expand1x1 1 1 fire8/relu_squeeze1x1 fire8/expand1x1 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/relu_expand1x1 ," +"Convolution fire8/expand3x3 1 1 fire8/relu_squeeze1x1 fire8/expand3x3 1 64 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/relu_expand3x3 ," +"Concat fire8/concat 2 1 fire8/relu_expand1x1 fire8/relu_expand3x3 fire8/concat 1 ," +"Pooling pool8 1 1 fire8/concat pool8 0 3 3 2 2 0 0 -1 -1 -1 1 ," +"Convolution fire9/squeeze1x1 1 1 pool8 fire9/squeeze1x1 1 512 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/relu_squeeze1x1 ," +"Convolution fire9/expand1x1 1 1 fire9/relu_squeeze1x1 fire9/expand1x1 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/relu_expand1x1 ," +"Convolution fire9/expand3x3 1 1 fire9/relu_squeeze1x1 fire9/expand3x3 1 64 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/relu_expand3x3 ," +"Concat fire9/concat 2 1 fire9/relu_expand1x1 fire9/relu_expand3x3 fire9/concat 1 ," +"Convolution 
conv10 1 1 fire9/concat conv10 1 512 1000 1 1 1 1 1 1 1 -1 1 1 ," +"ReLU relu_conv10 1 1 conv10 relu_conv10 ," +"Pooling pool10 1 1 relu_conv10 pool10 1 0 0 1 1 0 0 -1 -1 -1 0 ," +"SoftmaxCaffe prob 1 1 pool10 prob 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.1.tnnproto b/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.1.tnnproto new file mode 100644 index 0000000..bc87108 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark-model/squeezenet_v1.1.tnnproto @@ -0,0 +1,70 @@ +"1 67 1 4206624770 ," +"data 1 3 224 224 ," +" data squeezenet0_concat0 squeezenet0_concat1 squeezenet0_concat2 squeezenet0_concat3 squeezenet0_concat4 squeezenet0_concat5 squeezenet0_concat6 squeezenet0_concat7 squeezenet0_conv0_fwd squeezenet0_conv10_fwd squeezenet0_conv11_fwd squeezenet0_conv12_fwd squeezenet0_conv13_fwd squeezenet0_conv14_fwd squeezenet0_conv15_fwd squeezenet0_conv16_fwd squeezenet0_conv17_fwd squeezenet0_conv18_fwd squeezenet0_conv19_fwd squeezenet0_conv1_fwd squeezenet0_conv20_fwd squeezenet0_conv21_fwd squeezenet0_conv22_fwd squeezenet0_conv23_fwd squeezenet0_conv24_fwd squeezenet0_conv25_fwd squeezenet0_conv2_fwd squeezenet0_conv3_fwd squeezenet0_conv4_fwd squeezenet0_conv5_fwd squeezenet0_conv6_fwd squeezenet0_conv7_fwd squeezenet0_conv8_fwd squeezenet0_conv9_fwd squeezenet0_dropout0_fwd squeezenet0_flatten0_reshape0 squeezenet0_pool0_fwd squeezenet0_pool1_fwd squeezenet0_pool2_fwd squeezenet0_pool3_fwd squeezenet0_relu0_fwd squeezenet0_relu10_fwd squeezenet0_relu11_fwd squeezenet0_relu12_fwd squeezenet0_relu13_fwd squeezenet0_relu14_fwd squeezenet0_relu15_fwd squeezenet0_relu16_fwd squeezenet0_relu17_fwd squeezenet0_relu18_fwd squeezenet0_relu19_fwd squeezenet0_relu1_fwd squeezenet0_relu20_fwd squeezenet0_relu21_fwd squeezenet0_relu22_fwd squeezenet0_relu23_fwd squeezenet0_relu24_fwd squeezenet0_relu25_fwd squeezenet0_relu2_fwd squeezenet0_relu3_fwd squeezenet0_relu4_fwd squeezenet0_relu5_fwd squeezenet0_relu6_fwd squeezenet0_relu7_fwd 
squeezenet0_relu8_fwd squeezenet0_relu9_fwd ," +"squeezenet0_flatten0_reshape0 ," +" 65 ," +"Convolution squeezenet0_conv0_fwd 1 1 data squeezenet0_conv0_fwd 1 3 64 3 3 2 2 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu0_fwd 1 1 squeezenet0_conv0_fwd squeezenet0_relu0_fwd ," +"Pooling squeezenet0_pool0_fwd 1 1 squeezenet0_relu0_fwd squeezenet0_pool0_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"Convolution squeezenet0_conv1_fwd 1 1 squeezenet0_pool0_fwd squeezenet0_conv1_fwd 1 64 16 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu1_fwd 1 1 squeezenet0_conv1_fwd squeezenet0_relu1_fwd ," +"Convolution squeezenet0_conv2_fwd 1 1 squeezenet0_relu1_fwd squeezenet0_conv2_fwd 1 16 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu2_fwd 1 1 squeezenet0_conv2_fwd squeezenet0_relu2_fwd ," +"Convolution squeezenet0_conv3_fwd 1 1 squeezenet0_relu1_fwd squeezenet0_conv3_fwd 1 16 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu3_fwd 1 1 squeezenet0_conv3_fwd squeezenet0_relu3_fwd ," +"Concat squeezenet0_concat0 2 1 squeezenet0_relu2_fwd squeezenet0_relu3_fwd squeezenet0_concat0 1 ," +"Convolution squeezenet0_conv4_fwd 1 1 squeezenet0_concat0 squeezenet0_conv4_fwd 1 128 16 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu4_fwd 1 1 squeezenet0_conv4_fwd squeezenet0_relu4_fwd ," +"Convolution squeezenet0_conv5_fwd 1 1 squeezenet0_relu4_fwd squeezenet0_conv5_fwd 1 16 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu5_fwd 1 1 squeezenet0_conv5_fwd squeezenet0_relu5_fwd ," +"Convolution squeezenet0_conv6_fwd 1 1 squeezenet0_relu4_fwd squeezenet0_conv6_fwd 1 16 64 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu6_fwd 1 1 squeezenet0_conv6_fwd squeezenet0_relu6_fwd ," +"Concat squeezenet0_concat1 2 1 squeezenet0_relu5_fwd squeezenet0_relu6_fwd squeezenet0_concat1 1 ," +"Pooling squeezenet0_pool1_fwd 1 1 squeezenet0_concat1 squeezenet0_pool1_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"Convolution squeezenet0_conv7_fwd 1 1 squeezenet0_pool1_fwd squeezenet0_conv7_fwd 1 128 32 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU 
squeezenet0_relu7_fwd 1 1 squeezenet0_conv7_fwd squeezenet0_relu7_fwd ," +"Convolution squeezenet0_conv8_fwd 1 1 squeezenet0_relu7_fwd squeezenet0_conv8_fwd 1 32 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu8_fwd 1 1 squeezenet0_conv8_fwd squeezenet0_relu8_fwd ," +"Convolution squeezenet0_conv9_fwd 1 1 squeezenet0_relu7_fwd squeezenet0_conv9_fwd 1 32 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu9_fwd 1 1 squeezenet0_conv9_fwd squeezenet0_relu9_fwd ," +"Concat squeezenet0_concat2 2 1 squeezenet0_relu8_fwd squeezenet0_relu9_fwd squeezenet0_concat2 1 ," +"Convolution squeezenet0_conv10_fwd 1 1 squeezenet0_concat2 squeezenet0_conv10_fwd 1 256 32 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu10_fwd 1 1 squeezenet0_conv10_fwd squeezenet0_relu10_fwd ," +"Convolution squeezenet0_conv11_fwd 1 1 squeezenet0_relu10_fwd squeezenet0_conv11_fwd 1 32 128 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu11_fwd 1 1 squeezenet0_conv11_fwd squeezenet0_relu11_fwd ," +"Convolution squeezenet0_conv12_fwd 1 1 squeezenet0_relu10_fwd squeezenet0_conv12_fwd 1 32 128 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu12_fwd 1 1 squeezenet0_conv12_fwd squeezenet0_relu12_fwd ," +"Concat squeezenet0_concat3 2 1 squeezenet0_relu11_fwd squeezenet0_relu12_fwd squeezenet0_concat3 1 ," +"Pooling squeezenet0_pool2_fwd 1 1 squeezenet0_concat3 squeezenet0_pool2_fwd 0 3 3 2 2 0 0 -1 -1 -1 0 ," +"Convolution squeezenet0_conv13_fwd 1 1 squeezenet0_pool2_fwd squeezenet0_conv13_fwd 1 256 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu13_fwd 1 1 squeezenet0_conv13_fwd squeezenet0_relu13_fwd ," +"Convolution squeezenet0_conv14_fwd 1 1 squeezenet0_relu13_fwd squeezenet0_conv14_fwd 1 48 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu14_fwd 1 1 squeezenet0_conv14_fwd squeezenet0_relu14_fwd ," +"Convolution squeezenet0_conv15_fwd 1 1 squeezenet0_relu13_fwd squeezenet0_conv15_fwd 1 48 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu15_fwd 1 1 squeezenet0_conv15_fwd squeezenet0_relu15_fwd ," 
+"Concat squeezenet0_concat4 2 1 squeezenet0_relu14_fwd squeezenet0_relu15_fwd squeezenet0_concat4 1 ," +"Convolution squeezenet0_conv16_fwd 1 1 squeezenet0_concat4 squeezenet0_conv16_fwd 1 384 48 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu16_fwd 1 1 squeezenet0_conv16_fwd squeezenet0_relu16_fwd ," +"Convolution squeezenet0_conv17_fwd 1 1 squeezenet0_relu16_fwd squeezenet0_conv17_fwd 1 48 192 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu17_fwd 1 1 squeezenet0_conv17_fwd squeezenet0_relu17_fwd ," +"Convolution squeezenet0_conv18_fwd 1 1 squeezenet0_relu16_fwd squeezenet0_conv18_fwd 1 48 192 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu18_fwd 1 1 squeezenet0_conv18_fwd squeezenet0_relu18_fwd ," +"Concat squeezenet0_concat5 2 1 squeezenet0_relu17_fwd squeezenet0_relu18_fwd squeezenet0_concat5 1 ," +"Convolution squeezenet0_conv19_fwd 1 1 squeezenet0_concat5 squeezenet0_conv19_fwd 1 384 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu19_fwd 1 1 squeezenet0_conv19_fwd squeezenet0_relu19_fwd ," +"Convolution squeezenet0_conv20_fwd 1 1 squeezenet0_relu19_fwd squeezenet0_conv20_fwd 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu20_fwd 1 1 squeezenet0_conv20_fwd squeezenet0_relu20_fwd ," +"Convolution squeezenet0_conv21_fwd 1 1 squeezenet0_relu19_fwd squeezenet0_conv21_fwd 1 64 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu21_fwd 1 1 squeezenet0_conv21_fwd squeezenet0_relu21_fwd ," +"Concat squeezenet0_concat6 2 1 squeezenet0_relu20_fwd squeezenet0_relu21_fwd squeezenet0_concat6 1 ," +"Convolution squeezenet0_conv22_fwd 1 1 squeezenet0_concat6 squeezenet0_conv22_fwd 1 512 64 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu22_fwd 1 1 squeezenet0_conv22_fwd squeezenet0_relu22_fwd ," +"Convolution squeezenet0_conv23_fwd 1 1 squeezenet0_relu22_fwd squeezenet0_conv23_fwd 1 64 256 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu23_fwd 1 1 squeezenet0_conv23_fwd squeezenet0_relu23_fwd ," +"Convolution squeezenet0_conv24_fwd 1 1 squeezenet0_relu22_fwd 
squeezenet0_conv24_fwd 1 64 256 3 3 1 1 1 1 1 -1 1 1 ," +"ReLU squeezenet0_relu24_fwd 1 1 squeezenet0_conv24_fwd squeezenet0_relu24_fwd ," +"Concat squeezenet0_concat7 2 1 squeezenet0_relu23_fwd squeezenet0_relu24_fwd squeezenet0_concat7 1 ," +"Convolution squeezenet0_conv25_fwd 1 1 squeezenet0_concat7 squeezenet0_conv25_fwd 1 512 1000 1 1 1 1 0 0 1 -1 1 1 ," +"ReLU squeezenet0_relu25_fwd 1 1 squeezenet0_conv25_fwd squeezenet0_relu25_fwd ," +"Pooling squeezenet0_pool3_fwd 1 1 squeezenet0_relu25_fwd squeezenet0_pool3_fwd 1 13 13 13 13 0 0 -1 -1 -1 0 ," +"Reshape squeezenet0_flatten0_reshape0 1 1 squeezenet0_pool3_fwd squeezenet0_flatten0_reshape0 0 4 4 0 0 -1 1 ," diff --git a/3rdparty/TNN/benchmark/benchmark_android/.gitignore b/3rdparty/TNN/benchmark/benchmark_android/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/3rdparty/TNN/benchmark/benchmark_android/README.md b/3rdparty/TNN/benchmark/benchmark_android/README.md new file mode 100644 index 0000000..ce8879a --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/README.md @@ -0,0 +1,7 @@ +models benchmark: +push all benchmark models to android device dir /data/local/tmp/benchmark-model, then run benchmark_models.sh, you will get all model benchmark cost time info. + +layer benchmark: +run benchmark_layer.sh -h, you can get help info. below is some import info: +run benchmark_layer.sh --gtest_list_tests, you can get all layer benchmark list with parameters info, use --gtest_filter to filter layer benchmark. for example, run benchmark_layer.sh --gtest_filter=LayerTest/AddLayer*, you can benchmark add layer only;run benchmark_layer.sh --gtest_filter=LayerTest/AddLayerTest.AddLayer/0, you can benchmark add layer with one special parameter only. 
+ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark-release.apk b/3rdparty/TNN/benchmark/benchmark_android/benchmark-release.apk new file mode 100644 index 0000000..5f72527 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark-release.apk differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/.gitignore b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/.gitignore new file mode 100644 index 0000000..4978360 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/.gitignore @@ -0,0 +1,2 @@ +/build +.idea diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/build.gradle b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/build.gradle new file mode 100644 index 0000000..2cf9c39 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/build.gradle @@ -0,0 +1,26 @@ +apply plugin: 'com.android.application' + +android { + compileSdkVersion 26 + buildToolsVersion "26.0.2" + + defaultConfig { + minSdkVersion 15 + targetSdkVersion 26 + versionCode 1000 + versionName "1.0.0" + } + buildTypes { + release { + minifyEnabled true + proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' + } + } +} + +dependencies { + implementation fileTree(include: ['*.jar'], dir: 'libs') + implementation 'com.android.support:appcompat-v7:26.0.2' +} + + diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.jar b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..f6b961f Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.jar differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.properties 
b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..8126a14 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Tue Dec 15 11:14:35 CST 2020 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-all.zip diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew new file mode 100755 index 0000000..cccdd3d --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). 
+cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 
+ esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew.bat b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew.bat new file mode 100644 index 0000000..f955316 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. 
+ +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/proguard-rules.pro b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/proguard-rules.pro new file mode 100644 index 0000000..f61a577 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/proguard-rules.pro @@ -0,0 +1,17 @@ +# Add project specific ProGuard rules here. +# By default, the flags in this file are appended to flags specified +# in C:\Users\neiltian\AppData\Local\Android\Sdk/tools/proguard/proguard-android.txt +# You can edit the include path and order by changing the proguardFiles +# directive in build.gradle. 
+# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# Add any project specific keep options here: + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/AndroidManifest.xml b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/AndroidManifest.xml new file mode 100644 index 0000000..be81cec --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/AndroidManifest.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/BenchmarkModel.java b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/BenchmarkModel.java new file mode 100644 index 0000000..74af129 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/BenchmarkModel.java @@ -0,0 +1,5 @@ +package com.tencent.tnn.benchmark; + +public class BenchmarkModel { + public native int nativeRun(String args, String fileDir); +} diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/FileUtils.java b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/FileUtils.java new file mode 100644 index 0000000..043c6b3 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/FileUtils.java @@ -0,0 +1,52 @@ +package com.tencent.tnn.benchmark; + +import android.content.res.AssetManager; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; 
+ +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.util.ArrayList; + + +public class FileUtils { + + private FileUtils() { + throw new AssertionError(); + } + + public static boolean copyFile(String input_path, String out_path) { + InputStream in = null; + OutputStream out = null; + try { + in = new FileInputStream(input_path); + out = new FileOutputStream(out_path); + copyFile(in, out); + in.close(); + in = null; + out.flush(); + out.close(); + out = null; + return true; + } catch(Exception e) { + e.printStackTrace(); + return false; + } + } + + public static void copyFile(InputStream in, OutputStream out) throws IOException { + byte[] buffer = new byte[1024]; + int read; + while((read = in.read(buffer)) != -1){ + out.write(buffer, 0, read); + } + } +} + + diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/MainActivity.java b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/MainActivity.java new file mode 100644 index 0000000..f7796e3 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/java/com/tencent/tnn/benchmark/MainActivity.java @@ -0,0 +1,70 @@ +package com.tencent.tnn.benchmark; + + +import android.app.Activity; +import android.content.Intent; +import android.os.Bundle; +import android.util.Log; +import android.widget.TextView; + +import java.io.File; + +public class MainActivity extends Activity { + + private TextView lightLiveCheckBtn; + + private static final String TAG = "TNN_BenchmarkModelActivity"; + private BenchmarkModel benchmark = new BenchmarkModel(); + private static final String ARGS_INTENT_KEY_ARGS_0 = "args"; + private static final String ARGS_INTENT_KEY_ARGS_1 = "--args"; + 
private static final String ARGS_INTENT_KEY_BENCHMARK_DIR = "benchmark-dir"; + private static final String ARGS_INTENT_KEY_LOAD_LIST = "load-list"; + private static final String ARGS_INTENT_KEY_MODEL = "model"; + + @Override + protected void onCreate(Bundle savedInstanceState) { + + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + + init(); + + } + + private void init() { + String model = ""; + try { + Intent intent = getIntent(); + Bundle bundle = intent.getExtras(); + String benchmark_dir = bundle.getString(ARGS_INTENT_KEY_BENCHMARK_DIR, "/data/local/tmp/tnn-benchmark/"); + String[] load_list = bundle.getStringArray(ARGS_INTENT_KEY_LOAD_LIST); + model = bundle.getString(ARGS_INTENT_KEY_MODEL); + for(String element : load_list) { + FileUtils.copyFile(benchmark_dir + "/" + element, getFilesDir().getAbsolutePath() + "/" + element); + System.load(getFilesDir().getAbsolutePath() + "/" + element); + } + final String args = bundle.getString(ARGS_INTENT_KEY_ARGS_0, bundle.getString(ARGS_INTENT_KEY_ARGS_1)); + final String file_dir = this.getFilesDir().getAbsolutePath(); + String output_path = file_dir + "/" + model; + File model_file = new File(output_path); + if(model_file.exists()) { + model_file.delete(); + } + model_file.createNewFile(); + + FileUtils.copyFile(benchmark_dir + "/" + "benchmark-model/" + model, output_path); + int result = benchmark.nativeRun(args, file_dir); + if(result != 0) { + Log.i("tnn", String.format(" %s TNN Benchmark time cost failed error code: %d \n", model , result)); + } + } catch(Error | Exception e) { + Log.i("tnn", String.format(" %s TNN Benchmark time cost failed error/exception: %s \n", model, e.getMessage())); + } + } + + @Override + protected void onResume() { + super.onResume(); + } + +} diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.cc 
b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.cc new file mode 100644 index 0000000..7bc6c82 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.cc @@ -0,0 +1,61 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include + +#include +#include +#include + +#include "benchmark_model_jni.h" +#include "test.h" + +#ifdef __ANDROID__ +#include +#endif + +JNIEXPORT jint JNICALL TNN_BENCHMARK_MODEL(nativeRun)(JNIEnv* env, jobject thiz, jstring args_obj, jstring file_dir) { + const char* args_chars = env->GetStringUTFChars(args_obj, nullptr); + const char* file_chars = env->GetStringUTFChars(file_dir, nullptr); + + // Split the args string into individual arg tokens. + std::istringstream iss(args_chars); + std::vector args_split{std::istream_iterator(iss), + {}}; + + // Construct a fake argv command-line object for the benchmark. 
+ std::vector argv; + std::string arg0 = "(BenchmarkModelAndroid)"; + std::string model_file; + bool model_path_option = false; + argv.push_back(const_cast(arg0.data())); + for (auto& arg : args_split) { + // Deal with the model path + if (!model_path_option) { + argv.push_back(const_cast(arg.data())); + } else { + model_file = arg; + std::ifstream fin(arg); + if (!fin) { + model_file = std::string(file_chars) + "/" + arg; + } + argv.push_back(const_cast(model_file.data())); + } + model_path_option = (arg.find("-mp") != std::string::npos); + } + + int result = TNN_NS::test::Run(static_cast(argv.size()), argv.data()); + env->ReleaseStringUTFChars(args_obj, args_chars); + return result; +} diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.h b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.h new file mode 100644 index 0000000..fb3ca42 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/jni/cc/benchmark_model_jni.h @@ -0,0 +1,30 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef ANDROID_BENCHMARK_MODEL_JNI_H_ +#define ANDROID_BENCHMARK_MODEL_JNI_H_ + +#include + +#define TNN_BENCHMARK_MODEL(sig) Java_com_tencent_tnn_benchmark_BenchmarkModel_##sig +#ifdef __cplusplus +extern "C"{ +#endif + +JNIEXPORT jint JNICALL TNN_BENCHMARK_MODEL(nativeRun)(JNIEnv *env, jobject thiz, jstring args_obj, jstring file_dir); + +#ifdef __cplusplus +} +#endif +#endif //ANDROID_BENCHMARK_MODEL_JNI_H_ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/drawable/tnn.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/drawable/tnn.png new file mode 100644 index 0000000..b07ecc7 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/drawable/tnn.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/ic_launcher.png new file mode 100644 index 0000000..3d82028 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/layout/activity_main.xml b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/layout/activity_main.xml new file mode 100644 index 0000000..2fe84b4 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/layout/activity_main.xml @@ -0,0 +1,20 @@ + + + + + + + + diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-hdpi/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-hdpi/ic_launcher.png new file mode 100644 index 0000000..96ff172 Binary files /dev/null and 
b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-hdpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-ldpi/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-ldpi/ic_launcher.png new file mode 100644 index 0000000..45cc506 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-ldpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-mdpi/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-mdpi/ic_launcher.png new file mode 100644 index 0000000..680c5f8 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-mdpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xhdpi/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xhdpi/ic_launcher.png new file mode 100644 index 0000000..40372f2 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xhdpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxhdpi/ic_launcher.png b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxhdpi/ic_launcher.png new file mode 100644 index 0000000..54219a3 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxhdpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxxhdpi/ic_launcher.png 
b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxxhdpi/ic_launcher.png new file mode 100644 index 0000000..8b610c5 Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/benchmark/src/main/res/mipmap-xxxhdpi/ic_launcher.png differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/build.gradle b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/build.gradle new file mode 100644 index 0000000..18ae165 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/build.gradle @@ -0,0 +1,27 @@ +// Top-level build file where you can add configuration options common to all sub-projects/modules. + +buildscript { + + repositories { + google() + jcenter() + } + dependencies { + classpath 'com.android.tools.build:gradle:3.5.2' + + + // NOTE: Do not place your application dependencies here; they belong + // in the individual module build.gradle files + } +} + +allprojects { + repositories { + google() + jcenter() + } +} + +task clean(type: Delete) { + delete rootProject.buildDir +} \ No newline at end of file diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle.properties b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle.properties new file mode 100644 index 0000000..45a138d --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle.properties @@ -0,0 +1,20 @@ +# Project-wide Gradle settings. + +# IDE (e.g. Android Studio) users: +# Gradle settings configured through the IDE *will override* +# any settings specified in this file. + +# For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html + +# Specifies the JVM arguments used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +org.gradle.jvmargs=-Xmx1536m + +# When configured, Gradle will run in incubating parallel mode. 
+# This option should only be used with decoupled projects. More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true + + +android.injected.testOnly=false diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.jar b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..13372ae Binary files /dev/null and b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.jar differ diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.properties b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..2cc90eb --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Fri Apr 24 15:10:50 CST 2020 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-all.zip diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew new file mode 100644 index 0000000..9d82f78 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew @@ -0,0 +1,160 @@ +#!/usr/bin/env bash + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Use the maximum available, or set MAX_FD != -1 to use that value. 
+MAX_FD="maximum" + +warn ( ) { + echo "$*" +} + +die ( ) { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; +esac + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 
+ esac +fi + +# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules +function splitJvmOpts() { + JVM_OPTS=("$@") +} +eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS +JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" + +exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew.bat b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew.bat new file mode 100644 index 0000000..8a0b282 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/gradlew.bat @@ -0,0 +1,90 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windowz variants + +if not "%OS%" == "Windows_NT" goto win9xME_args +if "%@eval[2+2]" == "4" goto 4NT_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* +goto execute + +:4NT_args +@rem Get arguments from the 4NT Shell from JP Software +set CMD_LINE_ARGS=%$ + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/settings.gradle b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/settings.gradle new file mode 100644 index 0000000..81101f3 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_app/settings.gradle @@ -0,0 +1 @@ +include ':benchmark' diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_layer.sh b/3rdparty/TNN/benchmark/benchmark_android/benchmark_layer.sh new file mode 100755 index 0000000..1dfe3f2 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_layer.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +ABI="arm64-v8a" +CLEAN="" +WORK_DIR=`pwd` +FILTER="" +DEVICE_TYPE="" +KERNEL_TUNE="-et" +BUILD_DIR=build +ANDROID_DIR=/data/local/tmp/tnn-benchmark +OUTPUT_LOG_FILE=benchmark_layer_result.txt +LOOP_COUNT=10 +ADB=adb + +function usage() { + echo "usage: ./benchmark_layer.sh [-32] [-c] [-f] [-d] [-t] " + echo "options:" + echo " -32 Build 32 bit." + echo " -c Clean up build folders." + echo " -d run with specified device" + echo " -f specified layer" + echo " -t CPU/GPU specify the platform to run" + echo " -et/-noet set kernel enable tune on or off" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function clean_build() { + echo $1 | grep "$BUILD_DIR\b" > /dev/null + if [[ "$?" != "0" ]]; then + exit_with_msg "Warnning: $1 seems not to be a BUILD folder." + fi + rm -rf $1 + mkdir $1 +} + +function build_android_bench() { + if [ "-c" == "$CLEAN" ]; then + clean_build $BUILD_DIR + fi + mkdir -p build + cd $BUILD_DIR + cmake ../../.. 
\ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DANDROID_ABI="${ABI}" \ + -DANDROID_STL=c++_static \ + -DANDROID_NATIVE_API_LEVEL=android-14 \ + -DANDROID_TOOLCHAIN=clang \ + -DTNN_ARM_ENABLE:BOOL=ON \ + -DTNN_OPENCL_ENABLE:BOOL=ON \ + -DTNN_TEST_ENABLE:BOOL=ON \ + -DTNN_BENCHMARK_MODE:BOOL=ON \ + -DTNN_UNIT_TEST_ENABLE:BOOL=ON \ + -DTNN_UNIT_TEST_BENCHMARK:BOOL=ON \ + -DTNN_PROFILER_ENABLE:BOOL=ON \ + -DBUILD_FOR_ANDROID_COMMAND=true + make -j4 +} + +function bench_android() { + build_android_bench + + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + $ADB shell "mkdir -p $ANDROID_DIR" + find . -name "*.so" | while read solib; do + $ADB push $solib $ANDROID_DIR + done + $ADB push test/unit_test/unit_test $ANDROID_DIR/unit_test + $ADB shell chmod 0777 $ANDROID_DIR/unit_test + + $ADB shell "getprop ro.product.model > ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ];then + DEVICE_TYPE="" + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + $ADB shell "echo '\nbenchmark device: ARM \n' >> ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + $ADB shell "cd ${ANDROID_DIR}; LD_LIBRARY_PATH=. ./unit_test ${KERNEL_TUNE} -ic ${LOOP_COUNT} -dt ARM --gtest_filter="*${FILTER}*" -ub >> $OUTPUT_LOG_FILE" + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ];then + LOOP_COUNT=1 + $ADB shell "echo '\nbenchmark device: OPENCL \n' >> ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + $ADB shell "cd ${ANDROID_DIR}; LD_LIBRARY_PATH=. 
./unit_test ${KERNEL_TUNE} -ic ${LOOP_COUNT} -dt OPENCL --gtest_filter="*${FILTER}*" -ub >> $OUTPUT_LOG_FILE" + fi + + $ADB pull $ANDROID_DIR/$OUTPUT_LOG_FILE ../$OUTPUT_LOG_FILE + cat ${WORK_DIR}/$OUTPUT_LOG_FILE +} + +while [ "$1" != "" ]; do + case $1 in + -32) + shift + ABI="armeabi-v7a with NEON" + ;; + -c) + shift + CLEAN="-c" + ;; + -f) + shift + FILTER=$1 + shift + ;; + -d) + shift + ADB="adb -s $1" + shift + ;; + -t) + shift + DEVICE_TYPE="$1" + shift + ;; + *) + usage + exit 1 + esac +done + +bench_android diff --git a/3rdparty/TNN/benchmark/benchmark_android/benchmark_models.sh b/3rdparty/TNN/benchmark/benchmark_android/benchmark_models.sh new file mode 100755 index 0000000..15ce89a --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_android/benchmark_models.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +export PATH=$PATH:$ANDROID_HOME/platform-tools + +ABI="arm64-v8a" +STL="c++_static" +PROFILING="OFF" +CLEAN="" +DEVICE_TYPE="" +MODEL_TYPE=TNN +USE_NCNN_MODEL=0 +KERNEL_TUNE="-et" +THREAD_NUM=1 +ADB=adb +BENCHMARK_TYPE="APP" +BENCHMARK_APP_DIR=benchmark_app/benchmark/ + +WORK_DIR=`pwd` +BENCHMARK_MODEL_DIR=$WORK_DIR/../benchmark-model +BUILD_DIR=build +BUILD_APP_DIR=build_app +ANDROID_DIR=/data/local/tmp/tnn-benchmark +ANDROID_DATA_DIR=$ANDROID_DIR/benchmark-model +OUTPUT_LOG_FILE=benchmark_models_result.txt +LOOP_COUNT=16 +WARM_UP_COUNT=5 +INTERVAL=5 + +benchmark_model_list=( +#test.tnnproto \ +) + +function usage() { + echo "usage: ./benchmark_models.sh [-32] [-c] [-b] [-f] [-d] [-t] " + echo "options:" + echo " -32 Build 32 bit." + echo " -c Clean up build folders." 
+ echo " -b build targets only" + echo " -f build profiling targets " + echo " -d run with specified device" + echo " -t CPU/GPU/HUAWEI_NPU specify the platform to run" + echo " -th num of threads to run, default: 1" + echo " -n use ncnn model" + echo " -bs benchmark shell" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function clean_build() { + echo $1 | grep "$BUILD_DIR\b" > /dev/null + if [[ "$?" != "0" ]]; then + exit_with_msg "Warnning: $1 seems not to be a BUILD folder." + fi + rm -rf $1 + mkdir $1 +} + +function build_android_bench() { + if [ "-c" == "$CLEAN" ]; then + clean_build $BUILD_DIR + fi + if [ "$DEVICE_TYPE" = "HUAWEI_NPU" ]; then + echo "NPU Enable" + # set c++ shared + STL="c++_shared" + HUAWEI_NPU_ENABLE="ON" + #start to cp + if [ ! -d ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/ ]; then + mkdir -p ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/ + fi + mkdir -p ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/armeabi-v7a + mkdir -p ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/arm64-v8a + cp $ANDROID_NDK/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_shared.so ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/armeabi-v7a/ + cp $ANDROID_NDK/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_shared.so ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/arm64-v8a/ + fi + mkdir -p $BUILD_DIR + cd $BUILD_DIR + cmake ../../.. 
\ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DANDROID_ABI="${ABI}" \ + -DANDROID_STL=${STL}\ + -DANDROID_NATIVE_API_LEVEL=android-14 \ + -DANDROID_TOOLCHAIN=clang \ + -DTNN_ARM_ENABLE:BOOL=ON \ + -DTNN_OPENCL_ENABLE:BOOL=ON \ + -DTNN_HUAWEI_NPU_ENABLE:BOOL=${HUAWEI_NPU_ENABLE} \ + -DTNN_OPENMP_ENABLE:BOOL=ON \ + -DTNN_TEST_ENABLE:BOOL=ON \ + -DTNN_BUILD_BENCHMARK_TEST_LIB_ENABLE:BOOL=ON \ + -DTNN_BENCHMARK_MODE:BOOL=ON \ + -DTNN_PROFILER_ENABLE:BOOL=${PROFILING} \ + -DTNN_BUILD_SHARED:BOOL=ON \ + -DBUILD_FOR_ANDROID_COMMAND=true + make -j4 +} + +function bench_android_shell() { + build_android_bench + + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + if [ "" != "$BUILD_ONLY" ]; then + echo "build done!" + exit 0 + fi + + $ADB shell "mkdir -p $ANDROID_DIR" + find . -name "*.so" | while read solib; do + $ADB push $solib $ANDROID_DIR + done + $ADB push test/TNNTest $ANDROID_DIR/TNNTest + $ADB shell chmod 0777 $ANDROID_DIR/TNNTest + + $ADB shell "mkdir -p $ANDROID_DIR/benchmark-model" + $ADB push ${BENCHMARK_MODEL_DIR} $ANDROID_DIR + + cd ${BENCHMARK_MODEL_DIR} + $ADB shell "getprop ro.product.model > ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ] && [ "$DEVICE_TYPE" != "HUAWEI_NPU" ]; then + DEVICE_TYPE="" + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + device=ARM + $ADB shell "echo '\nbenchmark device: ${device} \n' >> ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + + for benchmark_model in ${benchmark_model_list[*]} + do + $ADB logcat -c + $ADB shell "cd ${ANDROID_DIR}; LD_LIBRARY_PATH=. 
./TNNTest -th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${ANDROID_DATA_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE" + sleep $INTERVAL + $ADB shell "cd ${ANDROID_DIR}; logcat -d | grep \"TNN Benchmark time cost\" | grep ${device} | grep -w ${benchmark_model} | tail -n 1 >> $OUTPUT_LOG_FILE" + done + fi + + if [ "ON" == $PROFILING ]; then + WARM_UP_COUNT=5 + LOOP_COUNT=5 + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ];then + device=OPENCL + $ADB shell "echo '\nbenchmark device: ${device} \n' >> ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + for benchmark_model in ${benchmark_model_list[*]} + do + $ADB logcat -c + $ADB shell "cd ${ANDROID_DIR}; LD_LIBRARY_PATH=. ./TNNTest -th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${ANDROID_DATA_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE" + sleep $INTERVAL + $ADB shell "cd ${ANDROID_DIR}; logcat -d | grep \"TNN Benchmark time cost\" | grep ${device} | grep -w ${benchmark_model} | tail -n 1 >> $OUTPUT_LOG_FILE" + done + fi + + if [ "$DEVICE_TYPE" = "HUAWEI_NPU" ];then + echo "Run Huawei Npu" + device=HUAWEI_NPU + $ADB push ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/${ABI}/* $ANDROID_DIR/ + $ADB push ${WORK_DIR}/../../third_party/huawei_npu/hiai_ddk_latest/${ABI}/* $ANDROID_DIR/ + $ADB shell "echo '\nbenchmark device: ${device} \n' >> ${ANDROID_DIR}/$OUTPUT_LOG_FILE" + for benchmark_model in ${benchmark_model_list[*]} + do + $ADB logcat -c + $ADB shell "cd ${ANDROID_DIR}; LD_LIBRARY_PATH=. 
./TNNTest -th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -nt ${device} -mt ${MODEL_TYPE} -mp ${ANDROID_DATA_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE" + sleep $INTERVAL + $ADB shell "cd ${ANDROID_DIR}; logcat -d | grep \"TNN Benchmark time cost\" | grep ${device} | grep -w ${benchmark_model} | tail -n 1 >> $OUTPUT_LOG_FILE" + done + fi + + $ADB shell "echo '' >> $ANDROID_DIR/$OUTPUT_LOG_FILE" + $ADB shell "date >> $ANDROID_DIR/$OUTPUT_LOG_FILE" + + $ADB pull $ANDROID_DIR/$OUTPUT_LOG_FILE ${WORK_DIR}/$OUTPUT_LOG_FILE + cat ${WORK_DIR}/$OUTPUT_LOG_FILE + +} + +function build_android_bench_app() { + mkdir -p $BUILD_APP_DIR + cd $BUILD_APP_DIR + + cmake ../../benchmark_app/benchmark/ \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DANDROID_ABI="${ABI}" \ + -DANDROID_STL=${STL}\ + -DANDROID_NATIVE_API_LEVEL=android-14 \ + -DANDROID_TOOLCHAIN=clang + make -j4 + cd ../.. +} + +function bench_android_app() { + build_android_bench + build_android_bench_app + + if [ "$ABI" = "armeabi-v7a with NEON" ];then + adb install -r --abi armeabi-v7a benchmark-release.apk + else + adb install -r --abi $ABI benchmark-release.apk + fi + + $ADB shell "mkdir -p $ANDROID_DIR/benchmark-model" + $ADB push ${BENCHMARK_MODEL_DIR} $ANDROID_DIR + + $ADB shell "getprop ro.product.model" > $OUTPUT_LOG_FILE + + cd ${BUILD_DIR} + $ADB shell "mkdir -p $ANDROID_DIR" + find . 
-name "*.so" | while read solib; do + $ADB push $solib $ANDROID_DIR + done + + cd ${BENCHMARK_MODEL_DIR} + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ] && [ "$DEVICE_TYPE" != "HUAWEI_NPU" ]; then + DEVICE_TYPE="" + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ]; then + device=ARM + echo -e "\nbenchmark device: ${device}\n" + for benchmark_model in ${benchmark_model_list[*]} + do + TEST_ARGS="-th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${benchmark_model}" + $ADB logcat -c + $ADB shell am start -S -W \ + -n com.tencent.tnn.benchmark/.MainActivity \ + --es args \'${TEST_ARGS}\' --es benchmark-dir ${ANDROID_DIR} \ + --es model ${benchmark_model} \ + --esa load-list "libTNN.so,libTNNBenchmarkTest.so,libtnn_wrapper.so" + result="" + while [[ $result == "" ]] + do + sleep 1 + result=$($ADB logcat -d | grep "TNN Benchmark time cost" | grep -w ${benchmark_model} | tail -n 1) + done + echo $result + echo $result | grep -v "failed" >> $WORK_DIR/$OUTPUT_LOG_FILE + sleep $INTERVAL + done + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ]; then + device=OPENCL + echo -e "\nbenchmark device: ${device}\n" + for benchmark_model in ${benchmark_model_list[*]} + do + TEST_ARGS="-th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${benchmark_model}" + $ADB logcat -c + $ADB shell am start -S -W \ + -n com.tencent.tnn.benchmark/.MainActivity \ + --es args \'${TEST_ARGS}\' --es benchmark-dir ${ANDROID_DIR} \ + --es model ${benchmark_model} \ + --esa load-list "libTNN.so,libTNNBenchmarkTest.so,libtnn_wrapper.so" + result="" + while [[ $result == "" ]] + do + sleep 1 + result=$($ADB logcat -d | grep "TNN Benchmark time cost" | grep -w ${benchmark_model} | tail -n 1) + done + echo $result + echo $result | grep -v "failed" >> 
$WORK_DIR/$OUTPUT_LOG_FILE + sleep $INTERVAL + done + fi + + if [ "$DEVICE_TYPE" = "HUAWEI_NPU" ];then + device=HUAWEI_NPU + echo -e "\nbenchmark device: ${device}\n" + $ADB push ${WORK_DIR}/../../third_party/huawei_npu/cpp_lib/${ABI}/* $ANDROID_DIR/ + $ADB push ${WORK_DIR}/../../third_party/huawei_npu/hiai_ddk_latest/${ABI}/* $ANDROID_DIR/ + for benchmark_model in ${benchmark_model_list[*]} + do + TEST_ARGS="-th ${THREAD_NUM} ${KERNEL_TUNE} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -nt ${device} -mt ${MODEL_TYPE} -mp ${benchmark_model}" + $ADB logcat -c + $ADB shell am start -S -W \ + -n com.tencent.tnn.benchmark/.MainActivity \ + --es args \'${TEST_ARGS}\' --es benchmark-dir ${ANDROID_DIR} \ + --es model ${benchmark_model} \ + --esa load-list "libc++_shared.so,libhiai_ir.so,libcpucl.so,libhcl.so,libhiai.so,libhiai_ir_build.so,libTNN.so,libTNNBenchmarkTest.so,libtnn_wrapper.so" + result="" + while [[ $result == "" ]] + do + sleep 1 + result=$($ADB logcat -d | grep "TNN Benchmark time cost" | grep -w ${benchmark_model} | tail -n 1) + done + echo $result + echo $result | grep -v "failed" >> $WORK_DIR/$OUTPUT_LOG_FILE + sleep $INTERVAL + done + fi + + $ADB uninstall com.tencent.tnn.benchmark + + $ADB shell "echo ''" >> $WORK_DIR/$OUTPUT_LOG_FILE + $ADB shell "date" >> $WORK_DIR/$OUTPUT_LOG_FILE + + cat ${WORK_DIR}/$OUTPUT_LOG_FILE + +} + +while [ "$1" != "" ]; do + case $1 in + -32) + shift + ABI="armeabi-v7a with NEON" + ;; + -c) + shift + CLEAN="-c" + ;; + -b) + shift + BUILD_ONLY="-b" + ;; + -f) + shift + PROFILING="ON" + ;; + -d) + shift + ADB="adb -s $1" + shift + ;; + -t) + shift + DEVICE_TYPE="$1" + shift + ;; + -n) + shift + MODEL_TYPE=NCNN + ;; + -th) + shift + THREAD_NUM=$1 + shift + ;; + -bs) + shift + BENCHMARK_TYPE="SHELL" + ;; + *) + usage + exit 1 + esac +done + +if [[ "$BENCHMARK_TYPE" == "APP" && "$PROFILING" == "OFF" ]]; then + bench_android_app +else + bench_android_shell +fi diff --git 
a/3rdparty/TNN/benchmark/benchmark_armlinux/benchmark_models.sh b/3rdparty/TNN/benchmark/benchmark_armlinux/benchmark_models.sh new file mode 100755 index 0000000..8ca1760 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_armlinux/benchmark_models.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +PROFILING="OFF" +CLEAN="" +DEVICE_TYPE="" +MODEL_TYPE=TNN +USE_NCNN_MODEL=0 +SHARED_LIB="ON" +ARM="ON" +OPENMP="ON" +OPENCL="ON" +CC=aarch64-linux-gnu-gcc +CXX=aarch64-linux-gnu-g++ +TARGET_ARCH=aarch64 + +if [ -z $TNN_ROOT_PATH ] +then + TNN_ROOT_PATH=$(cd `dirname $0`; pwd)/../.. +fi + +WORK_DIR=`pwd` +BENCHMARK_MODEL_DIR=$WORK_DIR/../benchmark-model +BUILD_DIR=build +OUTPUT_LOG_FILE=benchmark_models_result.txt +LOOP_COUNT=16 +WARM_UP_COUNT=8 + +benchmark_model_list=( +#test.tnnproto \ +) + +function usage() { + echo "usage: ./benchmark_models.sh [-32] [-c] [-b] [-f] [-t] " + echo "options:" + echo " -32 Build 32 bit." + echo " -c Clean up build folders." + echo " -b build targets only" + echo " -f build profiling targets " + echo " -t CPU/GPU specify the platform to run" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function clean_build() { + echo $1 | grep "$BUILD_DIR\b" > /dev/null + if [[ "$?" != "0" ]]; then + exit_with_msg "Warnning: $1 seems not to be a BUILD folder." 
+ fi + rm -rf $1 + mkdir $1 +} + +function build_armlinux_bench() { + if [ "-c" == "$CLEAN" ]; then + clean_build $BUILD_DIR + fi + mkdir -p build + cd $BUILD_DIR + cmake ${TNN_ROOT_PATH} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DTNN_TEST_ENABLE=ON \ + -DTNN_CPU_ENABLE=ON \ + -DCMAKE_C_COMPILER=$CC \ + -DCMAKE_CXX_COMPILER=$CXX \ + -DCMAKE_BUILD_TYPE=Debug \ + -DTNN_ARM_ENABLE:BOOL=$ARM \ + -DTNN_OPENMP_ENABLE:BOOL=$OPENMP \ + -DTNN_OPENCL_ENABLE:BOOL=$OPENCL \ + -DTNN_PROFILER_ENABLE:BOOL=${PROFILING} \ + -DTNN_TEST_ENABLE=ON \ + -DTNN_UNIT_TEST_ENABLE=ON \ + -DTNN_COVERAGE=ON \ + -DCMAKE_SYSTEM_PROCESSOR=$TARGET_ARCH \ + -DTNN_BUILD_SHARED:BOOL=$SHARED_LIB \ + -DTNN_BENCHMARK_MODE=ON + + make -j4 +} + +function bench_armlinux() { + build_armlinux_bench + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + if [ "" != "$BUILD_ONLY" ]; then + echo "build done!" + exit 0 + fi + + cd ${BENCHMARK_MODEL_DIR} + + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ];then + DEVICE_TYPE="" + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + device=ARM + echo "benchmark device: ${device} " >> $WORK_DIR/$OUTPUT_LOG_FILE + + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=. ./build/test/TNNTest -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE + done + fi + + if [ "ON" == $PROFILING ]; then + WARM_UP_COUNT=5 + LOOP_COUNT=1 + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ];then + device=OPENCL + echo "benchmark device: ${device} " >> $WORK_DIR/$OUTPUT_LOG_FILE + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=. 
./build/test/TNNTest -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE + done + fi + + echo '' >> $OUTPUT_LOG_FILE + date >> $OUTPUT_LOG_FILE + + cat ${WORK_DIR}/$OUTPUT_LOG_FILE +} + +while [ "$1" != "" ]; do + case $1 in + -32) + shift + CC=arm-linux-gnueabihf-gcc + CXX=arm-linux-gnueabihf-g++ + TARGET_ARCH=arm + ;; + -c) + shift + CLEAN="-c" + ;; + -b) + shift + BUILD_ONLY="-b" + ;; + -f) + shift + PROFILING="ON" + ;; + -t) + shift + DEVICE_TYPE="$1" + shift + ;; + -n) + shift + MODEL_TYPE=NCNN + ;; + *) + usage + exit 1 + esac +done + +bench_armlinux diff --git a/3rdparty/TNN/benchmark/benchmark_cuda_linux/benchmark_models.sh b/3rdparty/TNN/benchmark/benchmark_cuda_linux/benchmark_models.sh new file mode 100755 index 0000000..a9e79ce --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_cuda_linux/benchmark_models.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +MODEL_TYPE=TNN +NETWORK_TYPE=TENSORRT +BUILD_ONLY="OFF" +DOWNLOAD_MODEL="OFF" + +if [ -z $TNN_ROOT_PATH ] +then + TNN_ROOT_PATH=$(cd `dirname $0`; pwd)/../.. +fi + +WORK_DIR=`pwd` +BENCHMARK_MODEL_DIR=$WORK_DIR/benchmark_model +OUTPUT_LOG_FILE=benchmark_models_result.txt +LOOP_COUNT=20 +WARM_UP_COUNT=5 + +benchmark_model_list=( +#test.tnnproto \ +) + +#URL, local path +function download_file() { #URL, path + if [ -e $2 ]; then return 0; fi + + name=`basename $2` + echo "downloading $name ..." + # status=`wget $1 -o $2` + status=`curl $1 -s -w %{http_code} -o $2` + if (( status == 200 )); then + return 0 + else + echo "download $name failed" 1>&2 + return -1 + fi +} + +#URL proto, URL model, directory +function download_model() { + directory="./$3" + if [ ! -e ${directory} ]; then + mkdir -p ${directory} + fi + + proto_name=`basename $1` + proto_path_local="${directory}/${proto_name}" + if [ ! -f ${proto_path_local} ]; then + download_file $1 $proto_path_local + succ=$? + if [ ! 
$succ -eq 0 ]; then + echo "please download model manually!!!(url:https://github.com/darrenyao87/tnn-models/tree/master/model)" + rm -r ${directory} + fi + fi + + model_name=`basename $2` + model_path_local="${directory}/${model_name}" + if [ ! -f ${model_path_local} ]; then + download_file $2 $model_path_local + succ=$? + if [ ! $succ -eq 0 ]; then + echo "please download model manually!!!(url:https://github.com/darrenyao87/tnn-models/tree/master/model)" + rm -r ${directory} + fi + fi +} + +function download_bench_model() { + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/resnet50/resnet50.opt.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/resnet50/resnet50.opt.tnnmodel" \ + benchmark_model + + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/bert-based/bert-based.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/bert-based/bert-based.tnnmodel" \ + benchmark_model + + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/bertsquad10/bertsquad10_clean.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/bertsquad10/bertsquad10_clean.tnnmodel" \ + benchmark_model +} + +function usage() { + echo "usage: ./benchmark_models.sh [-b] [-dl] [-mp]" + echo "options:" + echo " -b build only " + echo " -dl download model from github " + echo " -mp model dir path" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function build_cuda_linux_bench() { + cd $TNN_ROOT_PATH/scripts + ./build_cuda_linux.sh + cp $TNN_ROOT_PATH/scripts/cuda_linux_release $TNN_ROOT_PATH/benchmark/benchmark_cuda_linux/ -r +} + +function bench_cuda_linux() { + if [ "OFF" != "$DOWNLOAD_MODEL" ];then + download_bench_model + fi + + build_cuda_linux_bench + if [ $? 
!= 0 ];then + exit_with_msg "build failed" + fi + + if [ "OFF" != "$BUILD_ONLY" ]; then + echo "build done!" + exit 0 + fi + + if [ ! -d ${BENCHMARK_MODEL_DIR} ]; then + echo "please set model dir path or exec script with option -dl" + usage + exit -1 + fi + cd ${BENCHMARK_MODEL_DIR} + + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CUDA" ];then + device=CUDA + echo "benchmark device: ${device} " >> $WORK_DIR/$OUTPUT_LOG_FILE + + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=cuda_linux_release/lib ./cuda_linux_release/bin/TNNTest -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -nt ${NETWORK_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE + done + fi + + echo '' >> $OUTPUT_LOG_FILE + date >> $OUTPUT_LOG_FILE + + cat ${WORK_DIR}/$OUTPUT_LOG_FILE +} + +while [ "$1" != "" ]; do + case $1 in + -b) + shift + BUILD_ONLY=ON + ;; + -dl) + shift + DOWNLOAD_MODEL=ON + ;; + -mp) + shift + BENCHMARK_MODEL_DIR=$(cd $1; pwd) + shift + ;; + *) + usage + exit 1 + esac +done + +bench_cuda_linux diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.pbxproj b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.pbxproj new file mode 100644 index 0000000..295c6f7 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.pbxproj @@ -0,0 +1,469 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + 9D961FEA241163EE009B3FB1 /* BenchmarkListController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9D961FE9241163EE009B3FB1 /* BenchmarkListController.mm */; }; + 9D961FED24116548009B3FB1 /* RootNavController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9D961FEB24116548009B3FB1 /* RootNavController.mm */; }; + 9DD579EF23B5A20500A96E63 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 9DD579EE23B5A20500A96E63 /* AppDelegate.m */; }; + 9DD579F223B5A20500A96E63 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 9DD579F123B5A20500A96E63 /* SceneDelegate.m */; }; + 9DD579F523B5A20500A96E63 /* BenchmarkController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 9DD579F423B5A20500A96E63 /* BenchmarkController.mm */; }; + 9DD579F823B5A20500A96E63 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 9DD579F623B5A20500A96E63 /* Main.storyboard */; }; + 9DD579FA23B5A20E00A96E63 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 9DD579F923B5A20E00A96E63 /* Assets.xcassets */; }; + 9DD579FD23B5A20E00A96E63 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 9DD579FB23B5A20E00A96E63 /* LaunchScreen.storyboard */; }; + 9DD57A0023B5A20E00A96E63 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 9DD579FF23B5A20E00A96E63 /* main.m */; }; + 9DD57A0723B5A6BD00A96E63 /* model in Resources */ = {isa = PBXBuildFile; fileRef = 9DD57A0623B5A6BD00A96E63 /* model */; }; + 9DD57A1123B5A8D000A96E63 /* tnn.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 9DD57A0D23B5A8C400A96E63 /* tnn.framework */; }; + 9DD57A1523B5ACEB00A96E63 /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 9DD57A1423B5ACEB00A96E63 /* CoreML.framework */; }; + 9DD57A1723B5ACF900A96E63 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 
9DD57A1623B5ACF900A96E63 /* Foundation.framework */; }; + 9DD57A1923B5AD0100A96E63 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 9DD57A1823B5AD0100A96E63 /* Accelerate.framework */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 9DD57A0C23B5A8C400A96E63 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 9DD57A0823B5A8C400A96E63 /* tnn.xcodeproj */; + proxyType = 2; + remoteGlobalIDString = 9D2DB1D122D759C8000C508F; + remoteInfo = tnn; + }; + 9DD57A0E23B5A8CB00A96E63 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 9DD57A0823B5A8C400A96E63 /* tnn.xcodeproj */; + proxyType = 1; + remoteGlobalIDString = 9D2DB1D022D759C8000C508F; + remoteInfo = tnn; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 9D961FE8241163EE009B3FB1 /* BenchmarkListController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BenchmarkListController.h; sourceTree = ""; }; + 9D961FE9241163EE009B3FB1 /* BenchmarkListController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = BenchmarkListController.mm; sourceTree = ""; }; + 9D961FEB24116548009B3FB1 /* RootNavController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RootNavController.mm; sourceTree = ""; }; + 9D961FEC24116548009B3FB1 /* RootNavController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RootNavController.h; sourceTree = ""; }; + 9DD579EA23B5A20500A96E63 /* benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 9DD579ED23B5A20500A96E63 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; + 
9DD579EE23B5A20500A96E63 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; + 9DD579F023B5A20500A96E63 /* SceneDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SceneDelegate.h; sourceTree = ""; }; + 9DD579F123B5A20500A96E63 /* SceneDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SceneDelegate.m; sourceTree = ""; }; + 9DD579F323B5A20500A96E63 /* BenchmarkController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = BenchmarkController.h; sourceTree = ""; }; + 9DD579F423B5A20500A96E63 /* BenchmarkController.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = BenchmarkController.mm; sourceTree = ""; }; + 9DD579F723B5A20500A96E63 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; + 9DD579F923B5A20E00A96E63 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + 9DD579FC23B5A20E00A96E63 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; + 9DD579FE23B5A20E00A96E63 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 9DD579FF23B5A20E00A96E63 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; + 9DD57A0623B5A6BD00A96E63 /* model */ = {isa = PBXFileReference; lastKnownFileType = folder; name = model; path = ../../../model; sourceTree = ""; }; + 9DD57A0823B5A8C400A96E63 /* tnn.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = tnn.xcodeproj; path = ../../../platforms/ios/tnn.xcodeproj; sourceTree = ""; }; + 9DD57A1423B5ACEB00A96E63 /* 
CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; + 9DD57A1623B5ACF900A96E63 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; + 9DD57A1823B5AD0100A96E63 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 9DD579E723B5A20500A96E63 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 9DD57A1923B5AD0100A96E63 /* Accelerate.framework in Frameworks */, + 9DD57A1723B5ACF900A96E63 /* Foundation.framework in Frameworks */, + 9DD57A1523B5ACEB00A96E63 /* CoreML.framework in Frameworks */, + 9DD57A1123B5A8D000A96E63 /* tnn.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 9DD579E123B5A20500A96E63 = { + isa = PBXGroup; + children = ( + 9DD579EC23B5A20500A96E63 /* benchmark */, + 9DD579EB23B5A20500A96E63 /* Products */, + 9DD57A1023B5A8D000A96E63 /* Frameworks */, + ); + sourceTree = ""; + }; + 9DD579EB23B5A20500A96E63 /* Products */ = { + isa = PBXGroup; + children = ( + 9DD579EA23B5A20500A96E63 /* benchmark.app */, + ); + name = Products; + sourceTree = ""; + }; + 9DD579EC23B5A20500A96E63 /* benchmark */ = { + isa = PBXGroup; + children = ( + 9DD57A0823B5A8C400A96E63 /* tnn.xcodeproj */, + 9DD57A0623B5A6BD00A96E63 /* model */, + 9DD579ED23B5A20500A96E63 /* AppDelegate.h */, + 9DD579EE23B5A20500A96E63 /* AppDelegate.m */, + 9DD579F023B5A20500A96E63 /* SceneDelegate.h */, + 9DD579F123B5A20500A96E63 /* 
SceneDelegate.m */, + 9D961FEC24116548009B3FB1 /* RootNavController.h */, + 9D961FEB24116548009B3FB1 /* RootNavController.mm */, + 9DD579F323B5A20500A96E63 /* BenchmarkController.h */, + 9DD579F423B5A20500A96E63 /* BenchmarkController.mm */, + 9D961FE8241163EE009B3FB1 /* BenchmarkListController.h */, + 9D961FE9241163EE009B3FB1 /* BenchmarkListController.mm */, + 9DD579F623B5A20500A96E63 /* Main.storyboard */, + 9DD579F923B5A20E00A96E63 /* Assets.xcassets */, + 9DD579FB23B5A20E00A96E63 /* LaunchScreen.storyboard */, + 9DD579FE23B5A20E00A96E63 /* Info.plist */, + 9DD579FF23B5A20E00A96E63 /* main.m */, + ); + path = benchmark; + sourceTree = ""; + }; + 9DD57A0923B5A8C400A96E63 /* Products */ = { + isa = PBXGroup; + children = ( + 9DD57A0D23B5A8C400A96E63 /* tnn.framework */, + ); + name = Products; + sourceTree = ""; + }; + 9DD57A1023B5A8D000A96E63 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 9DD57A1823B5AD0100A96E63 /* Accelerate.framework */, + 9DD57A1623B5ACF900A96E63 /* Foundation.framework */, + 9DD57A1423B5ACEB00A96E63 /* CoreML.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 9DD579E923B5A20500A96E63 /* benchmark */ = { + isa = PBXNativeTarget; + buildConfigurationList = 9DD57A0323B5A20E00A96E63 /* Build configuration list for PBXNativeTarget "benchmark" */; + buildPhases = ( + 9DD579E623B5A20500A96E63 /* Sources */, + 9DD579E723B5A20500A96E63 /* Frameworks */, + 9DD579E823B5A20500A96E63 /* Resources */, + 9DD57A1323B5A91700A96E63 /* ShellScript */, + ); + buildRules = ( + ); + dependencies = ( + 9DD57A0F23B5A8CB00A96E63 /* PBXTargetDependency */, + ); + name = benchmark; + productName = benchmark; + productReference = 9DD579EA23B5A20500A96E63 /* benchmark.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 9DD579E223B5A20500A96E63 /* Project object */ = { + isa = PBXProject; + 
attributes = { + LastUpgradeCheck = 1130; + ORGANIZATIONNAME = tencent; + TargetAttributes = { + 9DD579E923B5A20500A96E63 = { + CreatedOnToolsVersion = 11.3; + }; + }; + }; + buildConfigurationList = 9DD579E523B5A20500A96E63 /* Build configuration list for PBXProject "benchmark" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 9DD579E123B5A20500A96E63; + productRefGroup = 9DD579EB23B5A20500A96E63 /* Products */; + projectDirPath = ""; + projectReferences = ( + { + ProductGroup = 9DD57A0923B5A8C400A96E63 /* Products */; + ProjectRef = 9DD57A0823B5A8C400A96E63 /* tnn.xcodeproj */; + }, + ); + projectRoot = ""; + targets = ( + 9DD579E923B5A20500A96E63 /* benchmark */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXReferenceProxy section */ + 9DD57A0D23B5A8C400A96E63 /* tnn.framework */ = { + isa = PBXReferenceProxy; + fileType = wrapper.framework; + path = tnn.framework; + remoteRef = 9DD57A0C23B5A8C400A96E63 /* PBXContainerItemProxy */; + sourceTree = BUILT_PRODUCTS_DIR; + }; +/* End PBXReferenceProxy section */ + +/* Begin PBXResourcesBuildPhase section */ + 9DD579E823B5A20500A96E63 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 9DD579FD23B5A20E00A96E63 /* LaunchScreen.storyboard in Resources */, + 9DD579FA23B5A20E00A96E63 /* Assets.xcassets in Resources */, + 9DD57A0723B5A6BD00A96E63 /* model in Resources */, + 9DD579F823B5A20500A96E63 /* Main.storyboard in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 9DD57A1323B5A91700A96E63 /* ShellScript */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputFileListPaths = ( + ); + inputPaths = ( + ); + outputFileListPaths = ( + ); + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; 
+ shellScript = "echo $TARGET_BUILD_DIR\ncp $TARGET_BUILD_DIR/tnn.framework/default.metallib $TARGET_BUILD_DIR/$TARGET_NAME.app/tnn.metallib\n"; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 9DD579E623B5A20500A96E63 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 9DD579F523B5A20500A96E63 /* BenchmarkController.mm in Sources */, + 9DD579EF23B5A20500A96E63 /* AppDelegate.m in Sources */, + 9DD57A0023B5A20E00A96E63 /* main.m in Sources */, + 9DD579F223B5A20500A96E63 /* SceneDelegate.m in Sources */, + 9D961FEA241163EE009B3FB1 /* BenchmarkListController.mm in Sources */, + 9D961FED24116548009B3FB1 /* RootNavController.mm in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 9DD57A0F23B5A8CB00A96E63 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + name = tnn; + targetProxy = 9DD57A0E23B5A8CB00A96E63 /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin PBXVariantGroup section */ + 9DD579F623B5A20500A96E63 /* Main.storyboard */ = { + isa = PBXVariantGroup; + children = ( + 9DD579F723B5A20500A96E63 /* Base */, + ); + name = Main.storyboard; + sourceTree = ""; + }; + 9DD579FB23B5A20E00A96E63 /* LaunchScreen.storyboard */ = { + isa = PBXVariantGroup; + children = ( + 9DD579FC23B5A20E00A96E63 /* Base */, + ); + name = LaunchScreen.storyboard; + sourceTree = ""; + }; +/* End PBXVariantGroup section */ + +/* Begin XCBuildConfiguration section */ + 9DD57A0123B5A20E00A96E63 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + 
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 13.2; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + }; + name = Debug; + }; + 9DD57A0223B5A20E00A96E63 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + 
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 13.2; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + 9DD57A0423B5A20E00A96E63 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = T7UMVXQMZ6; + INFOPLIST_FILE = benchmark/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 10.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + OTHER_LDFLAGS = ( + "-force_load", + "$(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/tnn.framework/tnn", + ); + PRODUCT_BUNDLE_IDENTIFIER = 
com.tencent.youtu.sdk.benchmark; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + VALID_ARCHS = arm64; + }; + name = Debug; + }; + 9DD57A0523B5A20E00A96E63 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = T7UMVXQMZ6; + INFOPLIST_FILE = benchmark/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 10.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + OTHER_LDFLAGS = ( + "-force_load", + "$(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/tnn.framework/tnn", + ); + PRODUCT_BUNDLE_IDENTIFIER = com.tencent.youtu.sdk.benchmark; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + VALID_ARCHS = arm64; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 9DD579E523B5A20500A96E63 /* Build configuration list for PBXProject "benchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9DD57A0123B5A20E00A96E63 /* Debug */, + 9DD57A0223B5A20E00A96E63 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 9DD57A0323B5A20E00A96E63 /* Build configuration list for PBXNativeTarget "benchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9DD57A0423B5A20E00A96E63 /* Debug */, + 9DD57A0523B5A20E00A96E63 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 9DD579E223B5A20500A96E63 /* Project object */; +} diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000..bb70779 --- /dev/null +++ 
b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 0000000..18d9810 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcshareddata/xcschemes/benchmark.xcscheme b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcshareddata/xcschemes/benchmark.xcscheme new file mode 100644 index 0000000..c9242b1 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcshareddata/xcschemes/benchmark.xcscheme @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcuserdata/darrenyao.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcuserdata/darrenyao.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist new file mode 100644 index 0000000..276b079 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark.xcodeproj/xcuserdata/darrenyao.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist @@ -0,0 +1,6 @@ + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.h b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.h new file mode 100644 index 0000000..4667d4c --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import + +@interface AppDelegate : UIResponder + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.m b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.m new file mode 100644 index 0000000..986fff3 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/AppDelegate.m @@ -0,0 +1,51 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import "AppDelegate.h" + +@interface AppDelegate () +@end + +@implementation AppDelegate +@synthesize window = _window; + +- (BOOL)application:(UIApplication *)application +didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { + // Override point for customization after application launch. 
+ return YES; +} + + +#pragma mark - UISceneSession lifecycle + + +- (UISceneConfiguration *)application:(UIApplication *)application +configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession + options:(UISceneConnectionOptions *)options { + // Called when a new scene session is being created. + // Use this method to select a configuration to create the new scene with. + return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" + sessionRole:connectingSceneSession.role]; +} + + +- (void)application:(UIApplication *)application +didDiscardSceneSessions:(NSSet *)sceneSessions { + // Called when the user discards a scene session. + // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions. + // Use this method to release any resources that were specific to the discarded scenes, as they will not return. +} + + +@end diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/AppIcon.appiconset/Contents.json b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000..d8db8d6 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,98 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "3x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" 
: "20x20", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "83.5x83.5", + "scale" : "2x" + }, + { + "idiom" : "ios-marketing", + "size" : "1024x1024", + "scale" : "1x" + } + ], + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/Contents.json b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/Contents.json new file mode 100644 index 0000000..da4a164 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/LaunchScreen.storyboard b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/LaunchScreen.storyboard new file mode 100644 index 0000000..865e932 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/LaunchScreen.storyboard @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/Main.storyboard b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/Main.storyboard new file mode 100644 index 0000000..51b1a07 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Base.lproj/Main.storyboard @@ -0,0 +1,111 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.h b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.h new file mode 100644 index 0000000..f3ea6d6 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import + +@interface BenchmarkController : UIViewController + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.mm b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.mm new file mode 100644 index 0000000..bf1bf8d --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkController.mm @@ -0,0 +1,294 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import "BenchmarkController.h" +#import +#include +#include +#include +#include +#include + +using namespace std; +using namespace TNN_NS; + +struct BenchModel { + string name; + string tnn_proto_content; + string tnn_model_content; + string coreml; +}; + +struct BenchOption { + int warm_count = 10; + int forward_count = 20; + int create_count = 1; + + string description() { + ostringstream ostr; + ostr << "create_count = " << create_count + << " warm_count = " << warm_count + << " forward_count = " << forward_count; + + ostr << std::endl; + return ostr.str(); + }; +}; + +struct BenchResult { + Status status; + + //time + float min = FLT_MAX; + float max = FLT_MIN; + float avg = 0; + float total = 0; + int count = 0; + + float diff = 0; + + int addTime(float time){ + count++; + total += time; + min = std::min(min, time); + max = std::max(max, time); + avg = total/count; + return 0; + }; + + string description() { + ostringstream ostr; + ostr << "min = " << min << " max = " << max << " avg = " < netmodels; + + for (NSString *modelDir in modelList) { +// if (![modelDir hasPrefix:@"mobilenetv1-ssd"]) { +// continue; +// } + NSString *modelDirPath = [modelZone stringByAppendingPathComponent:modelDir]; + BOOL isDirectory = NO; + + if ([[NSFileManager defaultManager] fileExistsAtPath:modelDirPath + isDirectory:&isDirectory]) { + if (!isDirectory) { + continue; + } + + BenchModel model; + model.name = modelDir.UTF8String; + + NSArray *modelFiles = [[NSFileManager defaultManager] contentsOfDirectoryAtPath:modelDirPath + error:nil]; + NSArray *protos = 
[modelFiles filteredArrayUsingPredicate:predicateProto]; + if (protos.count > 0) { + auto proto = [NSString stringWithContentsOfFile:[modelDirPath stringByAppendingPathComponent:protos[0]] + encoding:NSUTF8StringEncoding + error:nil]; + if (proto.length > 0) { + model.tnn_proto_content = proto.UTF8String; + } + } + NSArray *models = [modelFiles filteredArrayUsingPredicate:predicateModel]; + if (models.count > 0) { +// model.tnn_model_content = [modelDirPath stringByAppendingPathComponent:models[0]].UTF8String; + NSData *data = [NSData dataWithContentsOfFile:[modelDirPath + stringByAppendingPathComponent:models[0]]]; + model.tnn_model_content = string((const char *)[data bytes], [data length]); + } + NSArray *coremls = [modelFiles filteredArrayUsingPredicate:predicateCoreML]; + if (coremls.count > 0) { + model.coreml = [modelDirPath stringByAppendingPathComponent:coremls[0]].UTF8String; + } + netmodels.push_back(model); + } + } + return netmodels; +} + +- (IBAction)onBtnBenchmark:(id)sender { + //check release mode at Product->Scheme when running + //运行时请在Product->Scheme中确认意见调整到release模式 + + //搜索model目录下的所有模型 + auto allModels = [self getAllModels]; + + BenchOption option; + option.warm_count = 5; + option.forward_count = 10; + option.create_count = 1; + + //Get metallib path from app bundle + //PS:A script(Build Phases -> Run Script) is added to copy the metallib file in tnn framework project to benchmark app + //注意:此工程添加了脚本将tnn工程生成的tnn.metallib自动复制到app内 + auto pathLibrary = [[NSBundle mainBundle] pathForResource:@"tnn.metallib" + ofType:nil]; + pathLibrary = pathLibrary ? 
pathLibrary : @""; + + NSString *allResult = [NSString string]; + for (auto model : allModels) { + NSLog(@"model: %s", model.name.c_str()); + allResult = [allResult stringByAppendingFormat:@"model: %s\n", model.name.c_str()]; + + //benchmark on arm cpu + auto result_arm = [self benchmarkWithProtoContent:model.tnn_proto_content + model:model.tnn_model_content + coreml:model.coreml + library:pathLibrary.UTF8String + netType:NETWORK_TYPE_DEFAULT + deviceType:DEVICE_ARM + option:option]; + NSLog(@"arm: \ntime: %s", result_arm.description().c_str()); + allResult = [allResult stringByAppendingFormat:@"arm: \ntime: %s", + result_arm.description().c_str()]; + + + //benchmark on gpu + auto result_gpu = [self benchmarkWithProtoContent:model.tnn_proto_content + model:model.tnn_model_content + coreml:model.coreml + library:pathLibrary.UTF8String + netType:NETWORK_TYPE_DEFAULT + deviceType:DEVICE_METAL + option:option]; + NSLog(@"gpu: \ntime: %s", result_gpu.description().c_str()); + allResult = [allResult stringByAppendingFormat:@"gpu: \ntime: %s\n", + result_gpu.description().c_str()]; + } + + self.textViewResult.text = allResult; +} + +- (BenchResult)benchmarkWithProtoContent:(string)protoContent + model:(string)modelPathOrContent + coreml:(string)coremlDir + library:(string)metallibPath + netType:(NetworkType)net_type + deviceType:(DeviceType)device_type + option:(BenchOption)option { + BenchResult result; + + net_type = net_type == NETWORK_TYPE_COREML ? 
NETWORK_TYPE_COREML : NETWORK_TYPE_DEFAULT; + + //network init + //网络初始化 + TNN net; + { + ModelConfig config; + if (net_type == NETWORK_TYPE_COREML) { + config.model_type = MODEL_TYPE_COREML; + config.params = {coremlDir}; + } else { + config.model_type = MODEL_TYPE_TNN; + config.params = {protoContent, modelPathOrContent}; + } + + if (net_type == NETWORK_TYPE_COREML) { + config.model_type = MODEL_TYPE_COREML; + } + + result.status = net.Init(config); + if (result.status != TNN_OK) { + NSLog(@"net.Init Error: %s", result.status.description().c_str()); + return result; + } + } + + //create instance + //创建实例instance + std::shared_ptr instance = nullptr; + { + NetworkConfig network_config; + network_config.network_type = net_type; + network_config.library_path = {metallibPath}; + network_config.device_type = device_type; + instance = net.CreateInst(network_config, result.status); + if (result.status != TNN_OK || !instance) { + NSLog(@"net.CreateInst Error: %s", result.status.description().c_str()); + return result; + } + } + + //warm cpu, only used when benchmark + for (int cc=0; ccForward(); + if (result.status != TNN_OK) { + NSLog(@"instance.Forward Error: %s", result.status.description().c_str()); + return result; + } + } + + //inference + //前向推断 + bool profile_layer_time = false; +#if TNN_PROFILE + if (profile_layer_time) { + instance->StartProfile(); + } +#endif + for (int cc=0; ccForward(); + + gettimeofday(&tv_end, NULL); + double elapsed = (tv_end.tv_sec - tv_begin.tv_sec) * 1000.0 + (tv_end.tv_usec - tv_begin.tv_usec) / 1000.0; + result.addTime(elapsed); + } +#if TNN_PROFILE + if (profile_layer_time) { + instance->FinishProfile(true); + } +#endif + + return result; +} + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.h b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.h new file mode 100644 index 0000000..86ae0cd --- /dev/null +++ 
b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import + +@interface BenchmarkListController : UITableViewController + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.mm b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.mm new file mode 100644 index 0000000..a4a1c31 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/BenchmarkListController.mm @@ -0,0 +1,42 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#import "BenchmarkListController.h" +#import +#include +#include +#include +#include +#include + +using namespace std; +using namespace TNN_NS; + + +@interface BenchmarkListController () { +} +@end + +@implementation BenchmarkListController + +- (void)viewDidLoad { + [super viewDidLoad]; + // Do any additional setup after loading the view. + + +} + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Info.plist b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Info.plist new file mode 100644 index 0000000..7b6037c --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/Info.plist @@ -0,0 +1,64 @@ + + + + + CFBundleDevelopmentRegion + $(DEVELOPMENT_LANGUAGE) + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + $(PRODUCT_BUNDLE_PACKAGE_TYPE) + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + LSRequiresIPhoneOS + + UIApplicationSceneManifest + + UIApplicationSupportsMultipleScenes + + UISceneConfigurations + + UIWindowSceneSessionRoleApplication + + + UISceneConfigurationName + Default Configuration + UISceneDelegateClassName + SceneDelegate + UISceneStoryboardFile + Main + + + + + UILaunchStoryboardName + LaunchScreen + UIMainStoryboardFile + Main + UIRequiredDeviceCapabilities + + armv7 + + UISupportedInterfaceOrientations + + UIInterfaceOrientationPortrait + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + UISupportedInterfaceOrientations~ipad + + UIInterfaceOrientationPortrait + UIInterfaceOrientationPortraitUpsideDown + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.h b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.h new file mode 100644 index 0000000..a21668e --- /dev/null +++ 
b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#import + +@interface RootNavController : UINavigationController + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.mm b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.mm new file mode 100644 index 0000000..5781b2a --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/RootNavController.mm @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#import "RootNavController.h" + +@interface RootNavController () { +} +@end + +@implementation RootNavController + +- (void)viewDidLoad { + [super viewDidLoad]; + // Do any additional setup after loading the view. + + +} + + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.h b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.h new file mode 100644 index 0000000..5758a87 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.h @@ -0,0 +1,22 @@ +// Tencent is pleased to support the open source community by making TNN available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#import + +@interface SceneDelegate : UIResponder + +@property (strong, nonatomic) UIWindow * window; + +@end + diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.m b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.m new file mode 100644 index 0000000..62b8666 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/SceneDelegate.m @@ -0,0 +1,52 @@ +#import "SceneDelegate.h" + +@interface SceneDelegate () + +@end + +@implementation SceneDelegate + + +- (void)scene:(UIScene *)scene +willConnectToSession:(UISceneSession *)session + options:(UISceneConnectionOptions *)connectionOptions { + // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`. + // If using a storyboard, the `window` property will automatically be initialized and attached to the scene. + // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead). +} + + +- (void)sceneDidDisconnect:(UIScene *)scene { + // Called as the scene is being released by the system. + // This occurs shortly after the scene enters the background, or when its session is discarded. + // Release any resources associated with this scene that can be re-created the next time the scene connects. + // The scene may re-connect later, as its session was not neccessarily discarded (see `application:didDiscardSceneSessions` instead). +} + + +- (void)sceneDidBecomeActive:(UIScene *)scene { + // Called when the scene has moved from an inactive state to an active state. + // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive. +} + + +- (void)sceneWillResignActive:(UIScene *)scene { + // Called when the scene will move from an active state to an inactive state. + // This may occur due to temporary interruptions (ex. an incoming phone call). 
+} + + +- (void)sceneWillEnterForeground:(UIScene *)scene { + // Called as the scene transitions from the background to the foreground. + // Use this method to undo the changes made on entering the background. +} + + +- (void)sceneDidEnterBackground:(UIScene *)scene { + // Called as the scene transitions from the foreground to the background. + // Use this method to save data, release shared resources, and store enough scene-specific state information + // to restore the scene back to its current state. +} + + +@end diff --git a/3rdparty/TNN/benchmark/benchmark_ios/benchmark/main.m b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/main.m new file mode 100644 index 0000000..3bd5764 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_ios/benchmark/main.m @@ -0,0 +1,12 @@ + +#import +#import "AppDelegate.h" + +int main(int argc, char * argv[]) { + NSString * appDelegateClassName; + @autoreleasepool { + // Setup code that might create autoreleased objects goes here. + appDelegateClassName = NSStringFromClass([AppDelegate class]); + } + return UIApplicationMain(argc, argv, nil, appDelegateClassName); +} diff --git a/3rdparty/TNN/benchmark/benchmark_linux/benchmark_layer.sh b/3rdparty/TNN/benchmark/benchmark_linux/benchmark_layer.sh new file mode 100755 index 0000000..8080717 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_linux/benchmark_layer.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +CLEAN="" +WORK_DIR=`pwd` +FILTER="" +DEVICE_TYPE="" +KERNEL_TUNE="-et" +BUILD_DIR=build +OUTPUT_LOG_FILE=benchmark_layer_result.txt +LOOP_COUNT=10 + +function usage() { + echo "usage: ./benchmark_layer.sh [-32] [-c] [-f] [-d] [-t] " + echo "options:" + echo " -32 Build 32 bit." + echo " -c Clean up build folders." 
+ echo " -d run with specified device" + echo " -f specified layer" + echo " -t CPU/GPU specify the platform to run" + echo " -et/-noet set kernel enable tune on or off" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function clean_build() { + echo $1 | grep "$BUILD_DIR\b" > /dev/null + if [[ "$?" != "0" ]]; then + exit_with_msg "Warnning: $1 seems not to be a BUILD folder." + fi + rm -rf $1 + mkdir $1 +} + +function build_linux_bench() { + if [ "-c" == "$CLEAN" ]; then + clean_build $BUILD_DIR + fi + mkdir -p build + cd $BUILD_DIR + cmake ../../.. \ + -DCMAKE_BUILD_TYPE=Release \ + -DTNN_ARM_ENABLE:BOOL=ON \ + -DTNN_OPENCL_ENABLE:BOOL=ON \ + -DTNN_TEST_ENABLE:BOOL=ON \ + -DTNN_BENCHMARK_MODE:BOOL=ON \ + -DTNN_UNIT_TEST_ENABLE:BOOL=ON \ + -DTNN_UNIT_TEST_BENCHMARK:BOOL=ON \ + -DTNN_PROFILER_ENABLE:BOOL=ON + make -j4 +} + +function bench_android() { + build_linux_bench + + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ];then + DEVICE_TYPE="" + fi + + echo 'layer benchmark' 2>&1 |tee $WORK_DIR/$OUTPUT_LOG_FILE + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + echo 'benchmark device: ARM' 2>&1 |tee -a $WORK_DIR/$OUTPUT_LOG_FILE + cd ${WORK_DIR}; LD_LIBRARY_PATH=. ./build/test/unit_test/unit_test ${KERNEL_TUNE} -ic $LOOP_COUNT -dt ARM --gtest_filter="*${FILTER}*" -ub 2>&1 |tee -a $WORK_DIR/$OUTPUT_LOG_FILE + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ];then + LOOP_COUNT=1 + echo 'benchmark device: OPENCL' 2>&1 |tee -a $WORK_DIR/$OUTPUT_LOG_FILE + cd ${WORK_DIR}; LD_LIBRARY_PATH=. 
./build/test/unit_test/unit_test ${KERNEL_TUNE} -ic $LOOP_COUNT -dt OPENCL --gtest_filter="*${FILTER}*" -ub 2>&1 |tee -a $WORK_DIR/$OUTPUT_LOG_FILE + fi +} + +while [ "$1" != "" ]; do + case $1 in + -c) + shift + CLEAN="-c" + ;; + -f) + shift + FILTER=$1 + shift + ;; + -t) + shift + DEVICE_TYPE="$1" + shift + ;; + *) + usage + exit 1 + esac +done + +bench_android diff --git a/3rdparty/TNN/benchmark/benchmark_linux/benchmark_models.sh b/3rdparty/TNN/benchmark/benchmark_linux/benchmark_models.sh new file mode 100755 index 0000000..23e8b5f --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_linux/benchmark_models.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +PROFILING="OFF" +CLEAN="" +DEVICE_TYPE="" +MODEL_TYPE=TNN +USE_NCNN_MODEL=0 +SHARED_LIB="ON" +OPENCL="ON" + +if [ -z $TNN_ROOT_PATH ] +then + TNN_ROOT_PATH=$(cd `dirname $0`; pwd)/../.. +fi + +WORK_DIR=`pwd` +BENCHMARK_MODEL_DIR=$WORK_DIR/../benchmark-model +BUILD_DIR=build +OUTPUT_LOG_FILE=benchmark_models_result.txt +LOOP_COUNT=20 +WARM_UP_COUNT=10 + +benchmark_model_list=( +#test.tnnproto \ +) + +function usage() { + echo "usage: ./benchmark_models.sh [-32] [-c] [-b] [-f] [-t] " + echo "options:" + echo " -32 Build 32 bit." + echo " -c Clean up build folders." + echo " -b build targets only" + echo " -f build profiling targets " + echo " -t CPU/GPU specify the platform to run" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function clean_build() { + echo $1 | grep "$BUILD_DIR\b" > /dev/null + if [[ "$?" != "0" ]]; then + exit_with_msg "Warnning: $1 seems not to be a BUILD folder." 
+ fi + rm -rf $1 + mkdir $1 +} + +function build_bench() { + if [ "-c" == "$CLEAN" ]; then + clean_build $BUILD_DIR + fi + mkdir -p build + cd $BUILD_DIR + cmake ${TNN_ROOT_PATH} \ + -DCMAKE_BUILD_TYPE=Release \ + -DTNN_CPU_ENABLE=OFF \ + -DTNN_X86_ENABLE=ON \ + -DTNN_OPENCL_ENABLE:BOOL=$OPENCL \ + -DTNN_PROFILER_ENABLE:BOOL=${PROFILING} \ + -DTNN_TEST_ENABLE=ON \ + -DTNN_BUILD_SHARED:BOOL=$SHARED_LIB \ + -DTNN_BENCHMARK_MODE=ON \ + -DINTTYPES_FORMAT=C99 + + make -j4 +} + +function bench_linux() { + build_bench + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + if [ "" != "$BUILD_ONLY" ]; then + echo "build done!" + exit 0 + fi + + cd ${BENCHMARK_MODEL_DIR} + + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" != "GPU" ] && [ "$DEVICE_TYPE" != "CPU" ];then + DEVICE_TYPE="" + fi + + echo "benchmark log:" > $WORK_DIR/log_temp.txt + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + device=X86 + echo "benchmark device: ${device} " >> $WORK_DIR/log_temp.txt + + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=. ./build/test/TNNTest -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} >> log_temp.txt + done + fi + + if [ "ON" == $PROFILING ]; then + WARM_UP_COUNT=5 + LOOP_COUNT=1 + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "GPU" ];then + device=OPENCL + echo "benchmark device: ${device} " >> $WORK_DIR/log_temp.txt + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=. 
./build/test/TNNTest -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} -et >> log_temp.txt + done + fi + + cat $WORK_DIR/log_temp.txt |grep "time cost:" > $WORK_DIR/$OUTPUT_LOG_FILE + echo '' >> $OUTPUT_LOG_FILE + date >> $OUTPUT_LOG_FILE + + cat ${WORK_DIR}/$OUTPUT_LOG_FILE +} + +while [ "$1" != "" ]; do + case $1 in + -32) + shift + CC=arm-linux-gnueabihf-gcc + CXX=arm-linux-gnueabihf-g++ + TARGET_ARCH=arm + ;; + -c) + shift + CLEAN="-c" + ;; + -b) + shift + BUILD_ONLY="-b" + ;; + -f) + shift + PROFILING="ON" + ;; + -t) + shift + DEVICE_TYPE="$1" + shift + ;; + -n) + shift + MODEL_TYPE=NCNN + ;; + *) + usage + exit 1 + esac +done + +bench_linux diff --git a/3rdparty/TNN/benchmark/benchmark_windows/benchmark_models.bat b/3rdparty/TNN/benchmark/benchmark_windows/benchmark_models.bat new file mode 100644 index 0000000..6ffe3b7 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_windows/benchmark_models.bat @@ -0,0 +1,50 @@ +set TNN_DIR=%~dp0..\..\ + +@echo off +echo %TNN_DIR% +echo %1 + +if "%2" == "" ( + goto init_fold +) else ( + goto init_env +) + +:init_env + if %1 == x86 ( + echo "build x86" + call "D:\Microsoft\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars32.bat" + ) else ( + echo "build x64" + call "D:\Microsoft\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat" + ) + goto init_fold + +:init_fold + mkdir build_win + cd build_win + +cmake %TNN_DIR% -G "Ninja" ^ +-DCMAKE_BUILD_TYPE=Release ^ +-DCMAKE_SYSTEM_NAME=Windows ^ +-DTNN_CPU_ENABLE=ON ^ +-DTNN_OPENCL_ENABLE=ON ^ +-DTNN_X86_ENABLE=ON ^ +-DTNN_TEST_ENABLE=ON ^ +-DTNN_BENCHMARK_MODE=ON ^ +-DINTTYPES_FORMAT=C99 + +cmake --build . 
--config Release + +copy TNN.dll test\ +cd test +echo "Windows Benchmark" > log_temp.txt +FOR %%C IN (..\..\..\benchmark-model\*.tnnproto) DO (.\TNNTest.exe -wc 10 -ic 20 -mp %%C -mt TNN -dt OPENCL >> log_temp.txt) + +echo "Windows Benchmark" > result.txt +FOR /f "delims=] tokens=3" %%a IN (log_temp.txt) DO ( +echo "%%a"|find "time cost:" && echo %%a >>result.txt +) + +del log_temp.txt +copy result.txt ..\..\ \ No newline at end of file diff --git a/3rdparty/TNN/benchmark/benchmark_x86_linux/benchmark_models.sh b/3rdparty/TNN/benchmark/benchmark_x86_linux/benchmark_models.sh new file mode 100755 index 0000000..3832191 --- /dev/null +++ b/3rdparty/TNN/benchmark/benchmark_x86_linux/benchmark_models.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +MODEL_TYPE=TNN +NETWORK_TYPE=OPENVINO +NUM_THREAD=4 +BUILD_ONLY="OFF" +DOWNLOAD_MODEL="OFF" + +if [ -z $TNN_ROOT_PATH ] +then + TNN_ROOT_PATH=$(cd `dirname $0`; pwd)/../.. +fi + +WORK_DIR=`pwd` +BENCHMARK_MODEL_DIR=$WORK_DIR/benchmark_model +OUTPUT_LOG_FILE=benchmark_models_result.txt +LOOP_COUNT=20 +WARM_UP_COUNT=5 + +benchmark_model_list=( +#test.tnnproto \ +) + +#URL, local path +function download_file() { #URL, path + if [ -e $2 ]; then return 0; fi + + name=`basename $2` + echo "downloading $name ..." + # status=`wget $1 -o $2` + status=`curl $1 -s -w %{http_code} -o $2` + if (( status == 200 )); then + return 0 + else + echo "download $name failed" 1>&2 + return -1 + fi +} + +#URL proto, URL model, directory +function download_model() { + directory="./$3" + if [ ! -e ${directory} ]; then + mkdir -p ${directory} + fi + + proto_name=`basename $1` + proto_path_local="${directory}/${proto_name}" + if [ ! -f ${proto_path_local} ]; then + download_file $1 $proto_path_local + succ=$? + if [ ! $succ -eq 0 ]; then + echo "please download model manually!!!(url:https://github.com/darrenyao87/tnn-models/tree/master/model)" + rm -r ${directory} + fi + fi + + model_name=`basename $2` + model_path_local="${directory}/${model_name}" + if [ ! 
-f ${model_path_local} ]; then + download_file $2 $model_path_local + succ=$? + if [ ! $succ -eq 0 ]; then + echo "please download model manually!!!(url:https://github.com/darrenyao87/tnn-models/tree/master/model)" + rm -r ${directory} + fi + fi +} + +function download_bench_model() { + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/resnet50/resnet50.opt.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/resnet50/resnet50.opt.tnnmodel" \ + benchmark_model + + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/bert-based/bert-based.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/bert-based/bert-based.tnnmodel" \ + benchmark_model + + download_model \ + "https://raw.githubusercontent.com/darrenyao87/tnn-models/master/model/bertsquad10/bertsquad10_clean.tnnproto" \ + "https://media.githubusercontent.com/media/darrenyao87/tnn-models/master/model/bertsquad10/bertsquad10_clean.tnnmodel" \ + benchmark_model +} + +function usage() { + echo "usage: ./benchmark_models.sh [-th] [-b] [-dl] [-mp] [-native]" + echo "options:" + echo " -th thread num, defalut 1" + echo " -b build only " + echo " -dl download model from github " + echo " -mp model dir path" + echo " -native bench with native optimization" +} + +function exit_with_msg() { + echo $1 + exit 1 +} + +function build_x86_linux_bench() { + cd $TNN_ROOT_PATH/scripts + ./build_x86_linux.sh + cp $TNN_ROOT_PATH/scripts/x86_linux_release $TNN_ROOT_PATH/benchmark/benchmark_x86_linux/ -r +} + +function bench_x86_linux() { + if [ "OFF" != "$DOWNLOAD_MODEL" ];then + download_bench_model + fi + + build_x86_linux_bench + if [ $? != 0 ];then + exit_with_msg "build failed" + fi + + if [ "OFF" != "$BUILD_ONLY" ]; then + echo "build done!" + exit 0 + fi + + if [ ! 
-d ${BENCHMARK_MODEL_DIR} ]; then + echo "please set model dir path or exec script with option -dl" + usage + exit -1 + fi + cd ${BENCHMARK_MODEL_DIR} + + if [ ${#benchmark_model_list[*]} == 0 ];then + benchmark_model_list=`ls *.tnnproto` + fi + + if [ "$DEVICE_TYPE" = "" ] || [ "$DEVICE_TYPE" = "CPU" ];then + device=X86 + echo "benchmark device: ${device} " >> $WORK_DIR/$OUTPUT_LOG_FILE + + for benchmark_model in ${benchmark_model_list[*]} + do + cd ${WORK_DIR}; LD_LIBRARY_PATH=x86_linux_release/lib ./x86_linux_release/bin/TNNTest -th ${NUM_THREAD} -wc ${WARM_UP_COUNT} -ic ${LOOP_COUNT} -dt ${device} -mt ${MODEL_TYPE} -nt ${NETWORK_TYPE} -mp ${BENCHMARK_MODEL_DIR}/${benchmark_model} >> $OUTPUT_LOG_FILE + done + fi + + echo '' >> $OUTPUT_LOG_FILE + date >> $OUTPUT_LOG_FILE + + cat ${WORK_DIR}/$OUTPUT_LOG_FILE +} + +while [ "$1" != "" ]; do + case $1 in + -native) + shift + NETWORK_TYPE=DEFAULT + ;; + -th) + shift + NUM_THREAD="$1" + shift + ;; + -b) + shift + BUILD_ONLY=ON + ;; + -dl) + shift + DOWNLOAD_MODEL=ON + ;; + -mp) + shift + BENCHMARK_MODEL_DIR=$(cd $1; pwd) + shift + ;; + *) + usage + exit 1 + esac +done + +bench_x86_linux diff --git a/3rdparty/TNN/build_linux_native/libTNN.so.0 b/3rdparty/TNN/build_linux_native/libTNN.so.0 new file mode 100755 index 0000000..a7d6f0d Binary files /dev/null and b/3rdparty/TNN/build_linux_native/libTNN.so.0 differ diff --git a/3rdparty/TNN/build_linux_native/libTNN.so.0.1.0.0 b/3rdparty/TNN/build_linux_native/libTNN.so.0.1.0.0 new file mode 100755 index 0000000..a7d6f0d Binary files /dev/null and b/3rdparty/TNN/build_linux_native/libTNN.so.0.1.0.0 differ diff --git a/3rdparty/TNN/build_linux_native/test/TNNTest b/3rdparty/TNN/build_linux_native/test/TNNTest new file mode 100755 index 0000000..b8ee8e0 Binary files /dev/null and b/3rdparty/TNN/build_linux_native/test/TNNTest differ diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-install.cmake 
b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-install.cmake new file mode 100644 index 0000000..4629652 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-install.cmake @@ -0,0 +1,169 @@ +## gflags CMake configuration file + +# library version information +set (GFLAGS_VERSION_STRING "2.2.1") +set (GFLAGS_VERSION_MAJOR 2) +set (GFLAGS_VERSION_MINOR 2) +set (GFLAGS_VERSION_PATCH 0) + +# import targets +include ("${CMAKE_CURRENT_LIST_DIR}/gflags-targets.cmake") + +# installation prefix +get_filename_component (CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +get_filename_component (_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) + +# include directory +# +# Newer versions of CMake set the INTERFACE_INCLUDE_DIRECTORIES property +# of the imported targets. It is hence not necessary to add this path +# manually to the include search path for targets which link to gflags. +set (GFLAGS_INCLUDE_DIR "${_INSTALL_PREFIX}/include") + +if (gflags_FIND_COMPONENTS) + foreach (gflags_FIND_COMPONENT IN LISTS gflags_FIND_COMPONENTS) + if (gflags_FIND_REQUIRED_${gflags_FIND_COMPONENT} AND NOT TARGET gflags_${gflags_FIND_COMPONENT}) + message (FATAL_ERROR "Package gflags was installed without required component ${gflags_FIND_COMPONENT}!") + endif () + endforeach () + list (GET gflags_FIND_COMPONENTS 0 gflags_FIND_COMPONENT) +else () + set (gflags_FIND_COMPONENT) +endif () + +# default settings of GFLAGS_SHARED and GFLAGS_NOTHREADS +# +# It is recommended to use either one of the following find_package commands +# instead of setting the GFLAGS_(SHARED|NOTHREADS) variables: +# - find_package(gflags REQUIRED) +# - find_package(gflags COMPONENTS nothreads_static) +# - find_package(gflags COMPONENTS nothreads_shared) +# - find_package(gflags COMPONENTS static) +# - find_package(gflags COMPONENTS shared) +if (NOT DEFINED GFLAGS_SHARED) + if (DEFINED gflags_SHARED) + set (GFLAGS_SHARED ${gflags_SHARED}) + elseif 
(gflags_FIND_COMPONENT) + if (gflags_FIND_COMPONENT MATCHES "shared") + set (GFLAGS_SHARED TRUE) + else () + set (GFLAGS_SHARED FALSE) + endif () + elseif (TARGET gflags_shared OR TARGET gflags_nothreads_shared) + set (GFLAGS_SHARED TRUE) + else () + set (GFLAGS_SHARED FALSE) + endif () +endif () +if (NOT DEFINED GFLAGS_NOTHREADS) + if (DEFINED gflags_NOTHREADS) + set (GFLAGS_NOTHREADS ${gflags_NOTHREADS}) + elseif (gflags_FIND_COMPONENT) + if (gflags_FIND_COMPONENT MATCHES "nothreads") + set (GFLAGS_NOTHREADS TRUE) + else () + set (GFLAGS_NOTHREADS FALSE) + endif () + elseif (TARGET gflags_static OR TARGET gflags_shared) + set (GFLAGS_NOTHREADS FALSE) + else () + set (GFLAGS_NOTHREADS TRUE) + endif () +endif () + +# choose imported library target +if (NOT GFLAGS_TARGET) + if (gflags_TARGET) + set (GFLAGS_TARGET ${gflags_TARGET}) + elseif (GFLAGS_SHARED) + if (GFLAGS_NOTHREADS) + set (GFLAGS_TARGET gflags_nothreads_shared) + else () + set (GFLAGS_TARGET gflags_shared) + endif () + else () + if (GFLAGS_NOTHREADS) + set (GFLAGS_TARGET gflags_nothreads_static) + else () + set (GFLAGS_TARGET gflags_static) + endif () + endif () +endif () +if (NOT TARGET ${GFLAGS_TARGET}) + message (FATAL_ERROR "Your gflags installation does not contain a ${GFLAGS_TARGET} library target!" 
+ " Try a different combination of GFLAGS_SHARED and GFLAGS_NOTHREADS.") +endif () + +# add more convenient "gflags" import target +if (NOT TARGET gflags) + if (GFLAGS_SHARED) + add_library (gflags SHARED IMPORTED) + else () + add_library (gflags STATIC IMPORTED) + endif () + # copy INTERFACE_* properties + foreach (_GFLAGS_PROPERTY_NAME IN ITEMS + COMPILE_DEFINITIONS + COMPILE_FEATURES + COMPILE_OPTIONS + INCLUDE_DIRECTORIES + LINK_LIBRARIES + POSITION_INDEPENDENT_CODE + ) + get_target_property (_GFLAGS_PROPERTY_VALUE ${GFLAGS_TARGET} INTERFACE_${_GFLAGS_PROPERTY_NAME}) + if (_GFLAGS_PROPERTY_VALUE) + set_target_properties(gflags PROPERTIES + INTERFACE_${_GFLAGS_PROPERTY_NAME} "${_GFLAGS_PROPERTY_VALUE}" + ) + endif () + endforeach () + # copy IMPORTED_*_ properties + get_target_property (_GFLAGS_CONFIGURATIONS ${GFLAGS_TARGET} IMPORTED_CONFIGURATIONS) + set_target_properties (gflags PROPERTIES IMPORTED_CONFIGURATIONS "${_GFLAGS_CONFIGURATIONS}") + foreach (_GFLAGS_PROPERTY_NAME IN ITEMS + IMPLIB + LOCATION + LINK_DEPENDENT_LIBRARIES + LINK_INTERFACE_LIBRARIES + LINK_INTERFACE_LANGUAGES + LINK_INTERFACE_MULTIPLICITY + NO_SONAME + SONAME + ) + foreach (_GFLAGS_CONFIG IN LISTS _GFLAGS_CONFIGURATIONS) + get_target_property (_GFLAGS_PROPERTY_VALUE ${GFLAGS_TARGET} IMPORTED_${_GFLAGS_PROPERTY_NAME}_${_GFLAGS_CONFIG}) + if (_GFLAGS_PROPERTY_VALUE) + set_target_properties(gflags PROPERTIES + IMPORTED_${_GFLAGS_PROPERTY_NAME}_${_GFLAGS_CONFIG} "${_GFLAGS_PROPERTY_VALUE}" + ) + endif () + endforeach () + endforeach () + unset (_GFLAGS_CONFIGURATIONS) + unset (_GFLAGS_CONFIG) + unset (_GFLAGS_PROPERTY_NAME) + unset (_GFLAGS_PROPERTY_VALUE) +endif () + +# alias for default import target to be compatible with older CMake package configurations +set (GFLAGS_LIBRARIES "${GFLAGS_TARGET}") + +# set gflags_* variables for backwards compatibility +if (NOT "^gflags$" STREQUAL "^GFLAGS$") + foreach (_GFLAGS_VARIABLE IN ITEMS + VERSION_STRING + VERSION_MAJOR + VERSION_MINOR + 
VERSION_PATCH + INCLUDE_DIR + LIBRARIES + TARGET + ) + set (gflags_${_GFLAGS_VARIABLE} "${GFLAGS_${_GFLAGS_VARIABLE}}") + endforeach () + unset (_GFLAGS_VARIABLE) +endif () + +# unset private variables +unset (gflags_FIND_COMPONENT) +unset (_INSTALL_PREFIX) diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-version.cmake b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-version.cmake new file mode 100644 index 0000000..d68a39f --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config-version.cmake @@ -0,0 +1,21 @@ +## gflags CMake configuration version file + +# ----------------------------------------------------------------------------- +# library version +set (PACKAGE_VERSION "2.2.1") + +# ----------------------------------------------------------------------------- +# check compatibility + +# Perform compatibility check here using the input CMake variables. +# See example in http://www.cmake.org/Wiki/CMake_2.6_Notes. 
+ +set (PACKAGE_VERSION_COMPATIBLE TRUE) +set (PACKAGE_VERSION_UNSUITABLE FALSE) + +if ("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "2" AND + "${PACKAGE_FIND_VERSION_MINOR}" EQUAL "2") + set (PACKAGE_VERSION_EXACT TRUE) +else () + set (PACKAGE_VERSION_EXACT FALSE) +endif () diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config.cmake b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config.cmake new file mode 100644 index 0000000..82b5837 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-config.cmake @@ -0,0 +1,169 @@ +## gflags CMake configuration file + +# library version information +set (GFLAGS_VERSION_STRING "2.2.1") +set (GFLAGS_VERSION_MAJOR 2) +set (GFLAGS_VERSION_MINOR 2) +set (GFLAGS_VERSION_PATCH 0) + +# import targets +include ("${CMAKE_CURRENT_LIST_DIR}/gflags-targets.cmake") + +# installation prefix +get_filename_component (CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +get_filename_component (_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/." ABSOLUTE) + +# include directory +# +# Newer versions of CMake set the INTERFACE_INCLUDE_DIRECTORIES property +# of the imported targets. It is hence not necessary to add this path +# manually to the include search path for targets which link to gflags. 
+set (GFLAGS_INCLUDE_DIR "${_INSTALL_PREFIX}/include") + +if (gflags_FIND_COMPONENTS) + foreach (gflags_FIND_COMPONENT IN LISTS gflags_FIND_COMPONENTS) + if (gflags_FIND_REQUIRED_${gflags_FIND_COMPONENT} AND NOT TARGET gflags_${gflags_FIND_COMPONENT}) + message (FATAL_ERROR "Package gflags was installed without required component ${gflags_FIND_COMPONENT}!") + endif () + endforeach () + list (GET gflags_FIND_COMPONENTS 0 gflags_FIND_COMPONENT) +else () + set (gflags_FIND_COMPONENT) +endif () + +# default settings of GFLAGS_SHARED and GFLAGS_NOTHREADS +# +# It is recommended to use either one of the following find_package commands +# instead of setting the GFLAGS_(SHARED|NOTHREADS) variables: +# - find_package(gflags REQUIRED) +# - find_package(gflags COMPONENTS nothreads_static) +# - find_package(gflags COMPONENTS nothreads_shared) +# - find_package(gflags COMPONENTS static) +# - find_package(gflags COMPONENTS shared) +if (NOT DEFINED GFLAGS_SHARED) + if (DEFINED gflags_SHARED) + set (GFLAGS_SHARED ${gflags_SHARED}) + elseif (gflags_FIND_COMPONENT) + if (gflags_FIND_COMPONENT MATCHES "shared") + set (GFLAGS_SHARED TRUE) + else () + set (GFLAGS_SHARED FALSE) + endif () + elseif (TARGET gflags_shared OR TARGET gflags_nothreads_shared) + set (GFLAGS_SHARED TRUE) + else () + set (GFLAGS_SHARED FALSE) + endif () +endif () +if (NOT DEFINED GFLAGS_NOTHREADS) + if (DEFINED gflags_NOTHREADS) + set (GFLAGS_NOTHREADS ${gflags_NOTHREADS}) + elseif (gflags_FIND_COMPONENT) + if (gflags_FIND_COMPONENT MATCHES "nothreads") + set (GFLAGS_NOTHREADS TRUE) + else () + set (GFLAGS_NOTHREADS FALSE) + endif () + elseif (TARGET gflags_static OR TARGET gflags_shared) + set (GFLAGS_NOTHREADS FALSE) + else () + set (GFLAGS_NOTHREADS TRUE) + endif () +endif () + +# choose imported library target +if (NOT GFLAGS_TARGET) + if (gflags_TARGET) + set (GFLAGS_TARGET ${gflags_TARGET}) + elseif (GFLAGS_SHARED) + if (GFLAGS_NOTHREADS) + set (GFLAGS_TARGET gflags_nothreads_shared) + else () + set 
(GFLAGS_TARGET gflags_shared) + endif () + else () + if (GFLAGS_NOTHREADS) + set (GFLAGS_TARGET gflags_nothreads_static) + else () + set (GFLAGS_TARGET gflags_static) + endif () + endif () +endif () +if (NOT TARGET ${GFLAGS_TARGET}) + message (FATAL_ERROR "Your gflags installation does not contain a ${GFLAGS_TARGET} library target!" + " Try a different combination of GFLAGS_SHARED and GFLAGS_NOTHREADS.") +endif () + +# add more convenient "gflags" import target +if (NOT TARGET gflags) + if (GFLAGS_SHARED) + add_library (gflags SHARED IMPORTED) + else () + add_library (gflags STATIC IMPORTED) + endif () + # copy INTERFACE_* properties + foreach (_GFLAGS_PROPERTY_NAME IN ITEMS + COMPILE_DEFINITIONS + COMPILE_FEATURES + COMPILE_OPTIONS + INCLUDE_DIRECTORIES + LINK_LIBRARIES + POSITION_INDEPENDENT_CODE + ) + get_target_property (_GFLAGS_PROPERTY_VALUE ${GFLAGS_TARGET} INTERFACE_${_GFLAGS_PROPERTY_NAME}) + if (_GFLAGS_PROPERTY_VALUE) + set_target_properties(gflags PROPERTIES + INTERFACE_${_GFLAGS_PROPERTY_NAME} "${_GFLAGS_PROPERTY_VALUE}" + ) + endif () + endforeach () + # copy IMPORTED_*_ properties + get_target_property (_GFLAGS_CONFIGURATIONS ${GFLAGS_TARGET} IMPORTED_CONFIGURATIONS) + set_target_properties (gflags PROPERTIES IMPORTED_CONFIGURATIONS "${_GFLAGS_CONFIGURATIONS}") + foreach (_GFLAGS_PROPERTY_NAME IN ITEMS + IMPLIB + LOCATION + LINK_DEPENDENT_LIBRARIES + LINK_INTERFACE_LIBRARIES + LINK_INTERFACE_LANGUAGES + LINK_INTERFACE_MULTIPLICITY + NO_SONAME + SONAME + ) + foreach (_GFLAGS_CONFIG IN LISTS _GFLAGS_CONFIGURATIONS) + get_target_property (_GFLAGS_PROPERTY_VALUE ${GFLAGS_TARGET} IMPORTED_${_GFLAGS_PROPERTY_NAME}_${_GFLAGS_CONFIG}) + if (_GFLAGS_PROPERTY_VALUE) + set_target_properties(gflags PROPERTIES + IMPORTED_${_GFLAGS_PROPERTY_NAME}_${_GFLAGS_CONFIG} "${_GFLAGS_PROPERTY_VALUE}" + ) + endif () + endforeach () + endforeach () + unset (_GFLAGS_CONFIGURATIONS) + unset (_GFLAGS_CONFIG) + unset (_GFLAGS_PROPERTY_NAME) + unset (_GFLAGS_PROPERTY_VALUE) 
+endif () + +# alias for default import target to be compatible with older CMake package configurations +set (GFLAGS_LIBRARIES "${GFLAGS_TARGET}") + +# set gflags_* variables for backwards compatibility +if (NOT "^gflags$" STREQUAL "^GFLAGS$") + foreach (_GFLAGS_VARIABLE IN ITEMS + VERSION_STRING + VERSION_MAJOR + VERSION_MINOR + VERSION_PATCH + INCLUDE_DIR + LIBRARIES + TARGET + ) + set (gflags_${_GFLAGS_VARIABLE} "${GFLAGS_${_GFLAGS_VARIABLE}}") + endforeach () + unset (_GFLAGS_VARIABLE) +endif () + +# unset private variables +unset (gflags_FIND_COMPONENT) +unset (_INSTALL_PREFIX) diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-targets.cmake b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-targets.cmake new file mode 100644 index 0000000..89807e4 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags-targets.cmake @@ -0,0 +1,65 @@ +# Generated by CMake + +if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5) + message(FATAL_ERROR "CMake >= 2.6.0 required") +endif() +cmake_policy(PUSH) +cmake_policy(VERSION 2.6) +#---------------------------------------------------------------- +# Generated CMake target import file. +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Protect against multiple inclusion, which would fail when already imported targets are added once more. 
+set(_targetsDefined) +set(_targetsNotDefined) +set(_expectedTargets) +foreach(_expectedTarget gflags_nothreads_static) + list(APPEND _expectedTargets ${_expectedTarget}) + if(NOT TARGET ${_expectedTarget}) + list(APPEND _targetsNotDefined ${_expectedTarget}) + endif() + if(TARGET ${_expectedTarget}) + list(APPEND _targetsDefined ${_expectedTarget}) + endif() +endforeach() +if("${_targetsDefined}" STREQUAL "${_expectedTargets}") + unset(_targetsDefined) + unset(_targetsNotDefined) + unset(_expectedTargets) + set(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() +if(NOT "${_targetsDefined}" STREQUAL "") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n") +endif() +unset(_targetsDefined) +unset(_targetsNotDefined) +unset(_expectedTargets) + + +# Create imported target gflags_nothreads_static +add_library(gflags_nothreads_static STATIC IMPORTED) + +set_target_properties(gflags_nothreads_static PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "GFLAGS_IS_A_DLL=0" + INTERFACE_INCLUDE_DIRECTORIES "/home/dm/project/SDK/TNN_lib/TNN-latest/build_linux_native/third_party/gflags/include" + INTERFACE_LINK_LIBRARIES "/usr/lib/gcc/x86_64-linux-gnu/7/libgomp.so;/usr/lib/x86_64-linux-gnu/libpthread.so;/usr/lib/gcc/x86_64-linux-gnu/7/libgomp.so;/usr/lib/x86_64-linux-gnu/libpthread.so" +) + +# Import target "gflags_nothreads_static" for configuration "Release" +set_property(TARGET gflags_nothreads_static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(gflags_nothreads_static PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LOCATION_RELEASE "/home/dm/project/SDK/TNN_lib/TNN-latest/build_linux_native/third_party/gflags/libgflags_nothreads.a" + ) + +# This file does not depend on other imported targets which have +# been exported from the same project but in a separate export set. 
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/gflags.pc b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags.pc
new file mode 100644
index 0000000..d4662b8
--- /dev/null
+++ b/3rdparty/TNN/build_linux_native/third_party/gflags/gflags.pc
@@ -0,0 +1,14 @@
+prefix=/usr/local
+exec_prefix=${prefix}
+bindir=${prefix}/bin
+libdir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: gflags
+Version: 2.2.1
+Description: A commandline flags library that allows for distributed flags.
+URL: http://gflags.github.io/gflags
+Requires:
+Libs: -L${libdir} -lgflags
+Libs.private: -lpthread
+Cflags: -I${includedir}
diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/config.h b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/config.h
new file mode 100644
index 0000000..bf43a9a
--- /dev/null
+++ b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/config.h
@@ -0,0 +1,114 @@
+/* Generated from config.h.in during build configuration using CMake. */
+
+// Note: This header file is only used internally. It is not part of public interface!
+
+#ifndef GFLAGS_CONFIG_H_
+#define GFLAGS_CONFIG_H_
+
+
+// ---------------------------------------------------------------------------
+// System checks
+
+// Define if you build this library for a MS Windows OS.
+/* #undef OS_WINDOWS */
+
+// Define if you have the <stdint.h> header file.
+#define HAVE_STDINT_H
+
+// Define if you have the <sys/types.h> header file.
+#define HAVE_SYS_TYPES_H
+
+// Define if you have the <inttypes.h> header file.
+#define HAVE_INTTYPES_H
+
+// Define if you have the <sys/stat.h> header file.
+#define HAVE_SYS_STAT_H
+
+// Define if you have the <unistd.h> header file.
+#define HAVE_UNISTD_H
+
+// Define if you have the <fnmatch.h> header file.
+#define HAVE_FNMATCH_H
+
+// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
+/* #undef HAVE_SHLWAPI_H */
+
+// Define if you have the strtoll function.
+#define HAVE_STRTOLL
+
+// Define if you have the strtoq function.
+/* #undef HAVE_STRTOQ */
+
+// Define if you have the <pthread.h> header file.
+/* #undef HAVE_PTHREAD */
+
+// Define if your pthread library defines the type pthread_rwlock_t
+/* #undef HAVE_RWLOCK */
+
+// gcc requires this to get PRId64, etc.
+#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
+# define __STDC_FORMAT_MACROS 1
+#endif
+
+// ---------------------------------------------------------------------------
+// Package information
+
+// Name of package.
+#define PACKAGE gflags
+
+// Define to the full name of this package.
+#define PACKAGE_NAME gflags
+
+// Define to the full name and version of this package.
+#define PACKAGE_STRING gflags 2.2.1
+
+// Define to the one symbol short name of this package.
+#define PACKAGE_TARNAME gflags-2.2.1
+
+// Define to the version of this package.
+#define PACKAGE_VERSION 2.2.1
+
+// Version number of package.
+#define VERSION PACKAGE_VERSION
+
+// Define to the address where bug reports for this package should be sent.
+#define PACKAGE_BUGREPORT https://github.com/gflags/gflags/issues
+
+// ---------------------------------------------------------------------------
+// Path separator
+#ifndef PATH_SEPARATOR
+# ifdef OS_WINDOWS
+# define PATH_SEPARATOR '\\'
+# else
+# define PATH_SEPARATOR '/'
+# endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Windows
+
+// Always export symbols when compiling a shared library as this file is only
+// included by internal modules when building the gflags library itself.
+// The gflags_declare.h header file will set it to import these symbols otherwise.
+#ifndef GFLAGS_DLL_DECL +# if GFLAGS_IS_A_DLL && defined(_MSC_VER) +# define GFLAGS_DLL_DECL __declspec(dllexport) +# else +# define GFLAGS_DLL_DECL +# endif +#endif +// Flags defined by the gflags library itself must be exported +#ifndef GFLAGS_DLL_DEFINE_FLAG +# define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL +#endif + +#ifdef OS_WINDOWS +// The unittests import the symbols of the shared gflags library +# if GFLAGS_IS_A_DLL && defined(_MSC_VER) +# define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport) +# endif +# include "windows_port.h" +#endif + + +#endif // GFLAGS_CONFIG_H_ diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags.h b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags.h new file mode 100644 index 0000000..18cd369 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags.h @@ -0,0 +1,605 @@ +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// Revamped and reorganized by Craig Silverstein +// +// This is the file that should be included by any file which declares +// or defines a command line flag or wants to parse command line flags +// or print a program usage message (which will include information about +// flags). Executive summary, in the form of an example foo.cc file: +// +// #include "foo.h" // foo.h has a line "DECLARE_int32(start);" +// #include "validators.h" // hypothetical file defining ValidateIsFile() +// +// DEFINE_int32(end, 1000, "The last record to read"); +// +// DEFINE_string(filename, "my_file.txt", "The file to read"); +// // Crash if the specified file does not exist. +// static bool dummy = RegisterFlagValidator(&FLAGS_filename, +// &ValidateIsFile); +// +// DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...) +// +// void MyFunc() { +// if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end); +// } +// +// Then, at the command-line: +// ./foo --noverbose --start=5 --end=100 +// +// For more details, see +// doc/gflags.html +// +// --- A note about thread-safety: +// +// We describe many functions in this routine as being thread-hostile, +// thread-compatible, or thread-safe. Here are the meanings we use: +// +// thread-safe: it is safe for multiple threads to call this routine +// (or, when referring to a class, methods of this class) +// concurrently. 
+// thread-hostile: it is not safe for multiple threads to call this
+// routine (or methods of this class) concurrently. In gflags,
+// most thread-hostile routines are intended to be called early in,
+// or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+// this variable (when applied to variables), or to call const
+// methods of this class (when applied to classes), as long as no
+// other thread is writing to the variable or calling non-const
+// methods of this class.
+
+#ifndef GFLAGS_GFLAGS_H_
+#define GFLAGS_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+#include "gflags/gflags_declare.h" // IWYU pragma: export
+
+
+// We always want to export variables defined in user code
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+# define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
+# else
+# define GFLAGS_DLL_DEFINE_FLAG
+# endif
+#endif
+
+
+namespace GFLAGS_NAMESPACE {
+
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file. You may also find
+// it useful to register a validator with the flag. This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+// +// Example use: +// static bool ValidatePort(const char* flagname, int32 value) { +// if (value > 0 && value < 32768) // value is ok +// return true; +// printf("Invalid value for --%s: %d\n", flagname, (int)value); +// return false; +// } +// DEFINE_int32(port, 0, "What port to listen on"); +// static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort); + +// Returns true if successfully registered, false if not (because the +// first argument doesn't point to a command-line flag, or because a +// validator is already registered for this flag). +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag, bool (*validate_fn)(const char*, bool)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag, bool (*validate_fn)(const char*, int32)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint32* flag, bool (*validate_fn)(const char*, uint32)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag, bool (*validate_fn)(const char*, int64)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag, bool (*validate_fn)(const char*, uint64)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag, bool (*validate_fn)(const char*, double)); +extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&)); + +// Convenience macro for the registration of a flag validator +#define DEFINE_validator(name, validator) \ + static const bool name##_validator_registered = \ + GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator) + + +// -------------------------------------------------------------------- +// These methods are the best way to get access to info about the +// list of commandline flags. Note that these routines are pretty slow. +// GetAllFlags: mostly-complete info about the list, sorted by file. 
+// ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+// ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+//
+// No need to export this data only structure from DLL, avoiding VS warning 4251.
+struct CommandLineFlagInfo {
+ std::string name; // the name of the flag
+ std::string type; // the type of the flag: int32, etc
+ std::string description; // the "help text" associated with the flag
+ std::string current_value; // the current value, as a string
+ std::string default_value; // the default value, as a string
+ std::string filename; // 'cleaned' version of filename holding the flag
+ bool has_validator_fn; // true if RegisterFlagValidator called on this flag
+ bool is_default; // true if the flag has the default value and
+ // has not been set explicitly from the cmdline
+ // or via SetCommandLineOption
+ const void* flag_ptr; // pointer to the flag's current value (i.e. FLAGS_foo)
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(user) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// gflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in gflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0); // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
+extern GFLAGS_DLL_DECL const char* GetArgv(); // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0(); // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum(); // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName(); // basename(argv0)
+
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage(); // string set by SetUsageMessage()
+
+// VersionString() is thread-safe as long as SetVersionString() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* VersionString(); // string set by SetVersionString()
+
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro). But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT); + +// Return the CommandLineFlagInfo of the flagname. exit() if name not found. +// Example usage, to check if a flag's value is currently the default value: +// if (GetCommandLineFlagInfoOrDie("foo").is_default) ... +extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name); + +enum GFLAGS_DLL_DECL FlagSettingMode { + // update the flag's value (can call this multiple times). + SET_FLAGS_VALUE, + // update the flag's value, but *only if* it has not yet been updated + // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef". + SET_FLAG_IF_DEFAULT, + // set the flag's default value to this. If the flag has not yet updated + // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef") + // change the flag's current value to the new default value as well. + SET_FLAGS_DEFAULT +}; + +// Set a particular flag ("command line option"). Returns a string +// describing the new value that the option has been set to. The +// return value API is not well-specified, so basically just depend on +// it to be empty if the setting failed for some reason -- the name is +// not a valid flag name, or the value is not a valid value -- and +// non-empty else. + +// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case) +extern GFLAGS_DLL_DECL std::string SetCommandLineOption (const char* name, const char* value); +extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode); + + +// -------------------------------------------------------------------- +// Saves the states (value, default value, whether the user has set +// the flag, registered validators, etc) of all flags, and restores +// them when the FlagSaver is destroyed. 
This is very useful in +// tests, say, when you want to let your tests change the flags, but +// make sure that they get reverted to the original states when your +// test is complete. +// +// Example usage: +// void TestFoo() { +// FlagSaver s1; +// FLAG_foo = false; +// FLAG_bar = "some value"; +// +// // test happens here. You can return at any time +// // without worrying about restoring the FLAG values. +// } +// +// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all +// the work is done in the constructor and destructor, so in the standard +// usage example above, the compiler would complain that it's an +// unused variable. +// +// This class is thread-safe. However, its destructor writes to +// exactly the set of flags that have changed value during its +// lifetime, so concurrent _direct_ access to those flags +// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe. + +class GFLAGS_DLL_DECL FlagSaver { + public: + FlagSaver(); + ~FlagSaver(); + + private: + class FlagSaverImpl* impl_; // we use pimpl here to keep API steady + + FlagSaver(const FlagSaver&); // no copying! + void operator=(const FlagSaver&); +}__attribute((unused)); + +// -------------------------------------------------------------------- +// Some deprecated or hopefully-soon-to-be-deprecated functions. + +// This is often used for logging. TODO(csilvers): figure out a better way +extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString(); +// Usually where this is used, a FlagSaver should be used instead. +extern GFLAGS_DLL_DECL +bool ReadFlagsFromString(const std::string& flagfilecontents, + const char* prog_name, + bool errors_are_fatal); // uses SET_FLAGS_VALUE + +// These let you manually implement --flagfile functionality. +// DEPRECATED. 
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name); +extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal); // uses SET_FLAGS_VALUE + + +// -------------------------------------------------------------------- +// Useful routines for initializing flags from the environment. +// In each case, if 'varname' does not exist in the environment +// return defval. If 'varname' does exist but is not valid +// (e.g., not a number for an int32 flag), abort with an error. +// Otherwise, return the value. NOTE: for booleans, for true use +// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'. + +extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval); +extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval); +extern GFLAGS_DLL_DECL uint32 Uint32FromEnv(const char *varname, uint32 defval); +extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval); +extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval); +extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval); +extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval); + + +// -------------------------------------------------------------------- +// The next two functions parse gflags from main(): + +// Set the "usage" message for this program. For example: +// string usage("This program does nothing. Sample usage:\n"); +// usage += argv[0] + " "; +// SetUsageMessage(usage); +// Do not include commandline flags in the usage: we do that for you! +// Thread-hostile; meant to be called before any threads are spawned. +extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage); + +// Sets the version string, which is emitted with --version. +// For instance: SetVersionString("1.3"); +// Thread-hostile; meant to be called before any threads are spawned. 
+extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version); + + +// Looks for flags in argv and parses them. Rearranges argv to put +// flags first, or removes them entirely if remove_flags is true. +// If a flag is defined more than once in the command line or flag +// file, the last definition is used. Returns the index (into argv) +// of the first non-flag argument. +// See top-of-file for more details on this function. +#ifndef SWIG // In swig, use ParseCommandLineFlagsScript() instead. +extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags); +#endif + + +// Calls to ParseCommandLineNonHelpFlags and then to +// HandleCommandLineHelpFlags can be used instead of a call to +// ParseCommandLineFlags during initialization, in order to allow for +// changing default values for some FLAGS (via +// e.g. SetCommandLineOptionWithMode calls) between the time of +// command line parsing and the time of dumping help information for +// the flags as a result of command line parsing. If a flag is +// defined more than once in the command line or flag file, the last +// definition is used. Returns the index (into argv) of the first +// non-flag argument. (If remove_flags is true, will always return 1.) +extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags); + +// This is actually defined in gflags_reporting.cc. +// This function is misnamed (it also handles --version, etc.), but +// it's too late to change that now. :-( +extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags(); // in gflags_reporting.cc + +// Allow command line reparsing. Disables the error normally +// generated when an unknown flag is found, since it may be found in a +// later parse. Thread-hostile; meant to be called before any threads +// are spawned. +extern GFLAGS_DLL_DECL void AllowCommandLineReparsing(); + +// Reparse the flags that have not yet been recognized. 
Only flags +// registered since the last parse will be recognized. Any flag value +// must be provided as part of the argument using "=", not as a +// separate command line argument that follows the flag argument. +// Intended for handling flags from dynamically loaded libraries, +// since their flags are not registered until they are loaded. +extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags(); + +// Clean up memory allocated by flags. This is only needed to reduce +// the quantity of "potentially leaked" reports emitted by memory +// debugging tools such as valgrind. It is not required for normal +// operation, or for the google perftools heap-checker. It must only +// be called when the process is about to exit, and all threads that +// might access flags are quiescent. Referencing flags after this is +// called will have unexpected consequences. This is not safe to run +// when multiple threads might be running: the function is +// thread-hostile. +extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags(); + + +// -------------------------------------------------------------------- +// Now come the command line flag declaration/definition macros that +// will actually be used. They're kind of hairy. A major reason +// for this is initialization: we want people to be able to access +// variables in global constructors and have that not crash, even if +// their global constructor runs before the global constructor here. +// (Obviously, we can't guarantee the flags will have the correct +// default value in that case, but at least accessing them is safe.) +// The only way to do that is have flags point to a static buffer. +// So we make one, using a union to ensure proper alignment, and +// then use placement-new to actually set up the flag with the +// correct default value. In the same vein, we have to worry about +// flag access in global destructors, so FlagRegisterer has to be +// careful never to destroy the flag-values it constructs. 
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>. This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging". We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace. It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly. The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace. The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
+// or some such instead. We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+ // We instantiate this template ctor for all supported types,
+ // so it is possible to place implementation of the FlagRegisterer ctor in
+ // .cc file.
+ // Calling this constructor with unsupported type will produce linker error.
+ template <typename FlagType>
+ FlagRegisterer(const char* name,
+ const char* help, const char* filename,
+ FlagType* current_storage, FlagType* defvalue_storage);
+};
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+ +extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[]; + + +} // namespace GFLAGS_NAMESPACE + + +#ifndef SWIG // In swig, ignore the main flag declarations + +#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0 +// Need this construct to avoid the 'defined but not used' warning. +#define MAYBE_STRIPPED_HELP(txt) \ + (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp) +#else +#define MAYBE_STRIPPED_HELP(txt) txt +#endif + +// Each command-line flag has two variables associated with it: one +// with the current value, and one with the default value. However, +// we have a third variable, which is where value is assigned; it's a +// constant. This guarantees that FLAG_##value is initialized at +// static initialization time (e.g. before program-start) rather than +// than global construction time (which is after program-start but +// before main), at least when 'value' is a compile-time constant. We +// use a small trick for the "default value" variable, and call it +// FLAGS_no. This serves the second purpose of assuring a +// compile error if someone tries to define a flag named no +// which is illegal (--foo and --nofoo both affect the "foo" flag). +#define DEFINE_VARIABLE(type, shorttype, name, value, help) \ + namespace fL##shorttype { \ + static const type FLAGS_nono##name = value; \ + /* We always want to export defined variables, dll or no */ \ + GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \ + type FLAGS_no##name = FLAGS_nono##name; \ + static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \ + #name, MAYBE_STRIPPED_HELP(help), __FILE__, \ + &FLAGS_##name, &FLAGS_no##name); \ + } \ + using fL##shorttype::FLAGS_##name + +// For DEFINE_bool, we want to do the extra check that the passed-in +// value is actually a bool, and not a string or something that can be +// coerced to a bool. These declarations (no definition needed!) will +// help us do that, and never evaluate From, which is important. +// We'll use 'sizeof(IsBool(val))' to distinguish. 
This code requires +// that the compiler have different sizes for bool & double. Since +// this is not guaranteed by the standard, we check it with a +// COMPILE_ASSERT. +namespace fLB { +struct CompileAssert {}; +typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[ + (sizeof(double) != sizeof(bool)) ? 1 : -1]; +template double GFLAGS_DLL_DECL IsBoolFlag(const From& from); +GFLAGS_DLL_DECL bool IsBoolFlag(bool from); +} // namespace fLB + +// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros +// are in a separate include, gflags_declare.h, for reducing +// the physical transitive size for DECLARE use. +#define DEFINE_bool(name, val, txt) \ + namespace fLB { \ + typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[ \ + (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \ + } \ + DEFINE_VARIABLE(bool, B, name, val, txt) + +#define DEFINE_int32(name, val, txt) \ + DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \ + name, val, txt) + +#define DEFINE_uint32(name,val, txt) \ + DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint32, U, \ + name, val, txt) + +#define DEFINE_int64(name, val, txt) \ + DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \ + name, val, txt) + +#define DEFINE_uint64(name,val, txt) \ + DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \ + name, val, txt) + +#define DEFINE_double(name, val, txt) \ + DEFINE_VARIABLE(double, D, name, val, txt) + +// Strings are trickier, because they're not a POD, so we can't +// construct them at static-initialization time (instead they get +// constructed at global-constructor time, which is much later). To +// try to avoid crashes in that case, we use a char buffer to store +// the string, which we can static-initialize, and then placement-new +// into it later. It's not perfect, but the best we can do. 
+ +namespace fLS { + +inline clstring* dont_pass0toDEFINE_string(char *stringspot, + const char *value) { + return new(stringspot) clstring(value); +} +inline clstring* dont_pass0toDEFINE_string(char *stringspot, + const clstring &value) { + return new(stringspot) clstring(value); +} +inline clstring* dont_pass0toDEFINE_string(char *stringspot, + int value); + +// Auxiliary class used to explicitly call destructor of string objects +// allocated using placement new during static program deinitialization. +// The destructor MUST be an inline function such that the explicit +// destruction occurs in the same compilation unit as the placement new. +class StringFlagDestructor { + void *current_storage_; + void *defvalue_storage_; + +public: + + StringFlagDestructor(void *current, void *defvalue) + : current_storage_(current), defvalue_storage_(defvalue) {} + + ~StringFlagDestructor() { + reinterpret_cast(current_storage_ )->~clstring(); + reinterpret_cast(defvalue_storage_)->~clstring(); + } +}; + +} // namespace fLS + +// We need to define a var named FLAGS_no##name so people don't define +// --string and --nostring. And we need a temporary place to put val +// so we don't have to evaluate it twice. Two great needs that go +// great together! +// The weird 'using' + 'extern' inside the fLS namespace is to work around +// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10. 
See +// http://code.google.com/p/google-gflags/issues/detail?id=20 +#define DEFINE_string(name, val, txt) \ + namespace fLS { \ + using ::fLS::clstring; \ + using ::fLS::StringFlagDestructor; \ + static union { void* align; char s[sizeof(clstring)]; } s_##name[2]; \ + clstring* const FLAGS_no##name = ::fLS:: \ + dont_pass0toDEFINE_string(s_##name[0].s, \ + val); \ + static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \ + #name, MAYBE_STRIPPED_HELP(txt), __FILE__, \ + FLAGS_no##name, new (s_##name[1].s) clstring(*FLAGS_no##name)); \ + static StringFlagDestructor d_##name(s_##name[0].s, s_##name[1].s); \ + extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name; \ + using fLS::FLAGS_##name; \ + clstring& FLAGS_##name = *FLAGS_no##name; \ + } \ + using fLS::FLAGS_##name + +#endif // SWIG + + + + + +#endif // GFLAGS_GFLAGS_H_ diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_completions.h b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_completions.h new file mode 100644 index 0000000..2fa0db6 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_completions.h @@ -0,0 +1,121 @@ +// Copyright (c) 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// --- + +// +// Implement helpful bash-style command line flag completions +// +// ** Functional API: +// HandleCommandLineCompletions() should be called early during +// program startup, but after command line flag code has been +// initialized, such as the beginning of HandleCommandLineHelpFlags(). +// It checks the value of the flag --tab_completion_word. If this +// flag is empty, nothing happens here. If it contains a string, +// however, then HandleCommandLineCompletions() will hijack the +// process, attempting to identify the intention behind this +// completion. Regardless of the outcome of this deduction, the +// process will be terminated, similar to --helpshort flag +// handling. +// +// ** Overview of Bash completions: +// Bash can be told to programatically determine completions for the +// current 'cursor word'. It does this by (in this case) invoking a +// command with some additional arguments identifying the command +// being executed, the word being completed, and the previous word +// (if any). Bash then expects a sequence of output lines to be +// printed to stdout. 
If these lines all contain a common prefix +// longer than the cursor word, bash will replace the cursor word +// with that common prefix, and display nothing. If there isn't such +// a common prefix, bash will display the lines in pages using 'more'. +// +// ** Strategy taken for command line completions: +// If we can deduce either the exact flag intended, or a common flag +// prefix, we'll output exactly that. Otherwise, if information +// must be displayed to the user, we'll take the opportunity to add +// some helpful information beyond just the flag name (specifically, +// we'll include the default flag value and as much of the flag's +// description as can fit on a single terminal line width, as specified +// by the flag --tab_completion_columns). Furthermore, we'll try to +// make bash order the output such that the most useful or relevent +// flags are the most likely to be shown at the top. +// +// ** Additional features: +// To assist in finding that one really useful flag, substring matching +// was implemented. Before pressing a to get completion for the +// current word, you can append one or more '?' to the flag to do +// substring matching. Here's the semantics: +// --foo Show me all flags with names prefixed by 'foo' +// --foo? Show me all flags with 'foo' somewhere in the name +// --foo?? Same as prior case, but also search in module +// definition path for 'foo' +// --foo??? Same as prior case, but also search in flag +// descriptions for 'foo' +// Finally, we'll trim the output to a relatively small number of +// flags to keep bash quiet about the verbosity of output. If one +// really wanted to see all possible matches, appending a '+' to the +// search word will force the exhaustive list of matches to be printed. +// +// ** How to have bash accept completions from a binary: +// Bash requires that it be informed about each command that programmatic +// completion should be enabled for. 
Example addition to a .bashrc +// file would be (your path to gflags_completions.sh file may differ): + +/* +$ complete -o bashdefault -o default -o nospace -C \ + '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \ + time env binary_name another_binary [...] +*/ + +// This would allow the following to work: +// $ /path/to/binary_name --vmodule +// Or: +// $ ./bin/path/another_binary --gfs_u +// (etc) +// +// Sadly, it appears that bash gives no easy way to force this behavior for +// all commands. That's where the "time" in the above example comes in. +// If you haven't specifically added a command to the list of completion +// supported commands, you can still get completions by prefixing the +// entire command with "env". +// $ env /some/brand/new/binary --vmod +// Assuming that "binary" is a newly compiled binary, this should still +// produce the expected completion output. + + +#ifndef GFLAGS_COMPLETIONS_H_ +#define GFLAGS_COMPLETIONS_H_ + +namespace gflags { + +extern void HandleCommandLineCompletions(void); + +} + +#endif // GFLAGS_COMPLETIONS_H_ diff --git a/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_declare.h b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_declare.h new file mode 100644 index 0000000..2366380 --- /dev/null +++ b/3rdparty/TNN/build_linux_native/third_party/gflags/include/gflags/gflags_declare.h @@ -0,0 +1,153 @@ +// Copyright (c) 1999, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// +// Revamped and reorganized by Craig Silverstein +// +// This is the file that should be included by any file which declares +// command line flag. + +#ifndef GFLAGS_DECLARE_H_ +#define GFLAGS_DECLARE_H_ + + +// --------------------------------------------------------------------------- +// Namespace of gflags library symbols. +#define GFLAGS_NAMESPACE gflags + +// --------------------------------------------------------------------------- +// Windows DLL import/export. + +// Whether gflags library is a DLL. +// +// Set to 1 by default when the shared gflags library was built on Windows. +// Must be overwritten when this header file is used with the optionally also +// built static library instead; set by CMake's INTERFACE_COMPILE_DEFINITIONS. 
+#ifndef GFLAGS_IS_A_DLL +# define GFLAGS_IS_A_DLL 0 +#endif + +// We always want to import the symbols of the gflags library. +#ifndef GFLAGS_DLL_DECL +# if GFLAGS_IS_A_DLL && defined(_MSC_VER) +# define GFLAGS_DLL_DECL __declspec(dllimport) +# else +# define GFLAGS_DLL_DECL +# endif +#endif + +// We always want to import variables declared in user code. +#ifndef GFLAGS_DLL_DECLARE_FLAG +# if GFLAGS_IS_A_DLL && defined(_MSC_VER) +# define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport) +# else +# define GFLAGS_DLL_DECLARE_FLAG +# endif +#endif + +// --------------------------------------------------------------------------- +// Flag types +#include +#if 1 +# include // the normal place uint32_t is defined +#elif 1 +# include // the normal place u_int32_t is defined +#elif 1 +# include // a third place for uint32_t or u_int32_t +#endif + +namespace GFLAGS_NAMESPACE { + +#if 1 // C99 +typedef int32_t int32; +typedef uint32_t uint32; +typedef int64_t int64; +typedef uint64_t uint64; +#elif 0 // BSD +typedef int32_t int32; +typedef u_int32_t uint32; +typedef int64_t int64; +typedef u_int64_t uint64; +#elif 0 // Windows +typedef __int32 int32; +typedef unsigned __int32 uint32; +typedef __int64 int64; +typedef unsigned __int64 uint64; +#else +# error Do not know how to define a 32-bit integer quantity on your system +#endif + +} // namespace GFLAGS_NAMESPACE + + +namespace fLS { + +// The meaning of "string" might be different between now and when the +// macros below get invoked (e.g., if someone is experimenting with +// other string implementations that get defined after this file is +// included). Save the current meaning now and use it in the macros. 
+typedef std::string clstring; + +} // namespace fLS + + +#define DECLARE_VARIABLE(type, shorttype, name) \ + /* We always want to import declared variables, dll or no */ \ + namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \ + using fL##shorttype::FLAGS_##name + +#define DECLARE_bool(name) \ + DECLARE_VARIABLE(bool, B, name) + +#define DECLARE_int32(name) \ + DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name) + +#define DECLARE_uint32(name) \ + DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint32, U, name) + +#define DECLARE_int64(name) \ + DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name) + +#define DECLARE_uint64(name) \ + DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name) + +#define DECLARE_double(name) \ + DECLARE_VARIABLE(double, D, name) + +#define DECLARE_string(name) \ + /* We always want to import declared variables, dll or no */ \ + namespace fLS { \ + using ::fLS::clstring; \ + extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \ + } \ + using fLS::FLAGS_##name + + +#endif // GFLAGS_DECLARE_H_ diff --git a/3rdparty/TNN/cmake/macros.cmake b/3rdparty/TNN/cmake/macros.cmake new file mode 100644 index 0000000..65f71e5 --- /dev/null +++ b/3rdparty/TNN/cmake/macros.cmake @@ -0,0 +1,32 @@ +# The Lib Prefix +if (UNIX) + set(LIB_PFX "lib") + if (APPLE) + set(LIB_EXT ".dylib") + else () + set(LIB_EXT ".so") + endif () +else (UNIX) + set(LIB_PFX "") + set(LIB_EXT ".dll") +endif (UNIX) + +if(CMAKE_SYSTEM_NAME MATCHES "^Android") + set(SYSTEM.Android 1) +elseif(CMAKE_SYSTEM_NAME MATCHES "^Linux") + set(SYSTEM.Linux 1) +elseif(CMAKE_SYSTEM_NAME MATCHES "^Darwin") + set(SYSTEM.Darwin 1) +elseif(CMAKE_SYSTEM_NAME MATCHES "^iOS") + set(SYSTEM.iOS 1) +elseif(CMAKE_SYSTEM_NAME MATCHES "^Windows") + set(SYSTEM.Windows 1) +endif() + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(PROCESSOR.arm 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64") + set(PROCESSOR.aarch64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86") + 
set(PROCESSOR.x86 1) +endif() diff --git a/3rdparty/TNN/doc/benchmark_data.md b/3rdparty/TNN/doc/benchmark_data.md new file mode 100644 index 0000000..3e2061c --- /dev/null +++ b/3rdparty/TNN/doc/benchmark_data.md @@ -0,0 +1,119 @@ +# v0.3 benchmark + +* huawei P30 Pro(Kirin 980, Mali-G76 MP10) + +| benchmark model | cpu time(thread 1,fp16, ms) | gpu time(ms) | +|-----------------|-----------------------------|--------------| +| DenseNet 121 | 65.70 | 45.83 | +| Inception v3 | 130.98 | 67.36 | +| Inception v4 | 310.67 | 129.59 | +| MnasNet | 11.74 | 9.16 | +| MobileNet v1 | 16.39 | 11.18 | +| MobileNet v2 | 14.81 | 11.24 | +| ResNet50 v1 | 77.11 | 44.29 | +| ResNet50 v2 | 90.53 | 48.63 | +| ShuffleNet v2 | 7.66 | 10.39 | +| SqueezeNet 1.0 | 8.38 | 8.90 | +| SqueezeNet 1.1 | 8.37 | 8.66 | + +* xiaomi 6(Snapdragon 835, Adreno 540) + +| benchmark model | cpu time(thread 1,fp16, ms) | gpu time(ms) | +|-----------------|-----------------------------|--------------| +| DenseNet 121 | 349.65 | 86.81 | +| Inception v3 | 924.54 | 77.01 | +| Inception v4 | 2286.02 | 229.54 | +| MnasNet | 61.80 | 16.64 | +| MobileNet v1 | 95.46 | 12.30 | +| MobileNet v2 | 82.85 | 11.58 | +| ResNet50 v1 | 465.54 | 65.77 | +| ResNet50 v2 | 575.29 | 72.23 | +| ShuffleNet v2 | 36.93 | 22.30 | +| SqueezeNet 1.0 | 53.37 | 11.60 | +| SqueezeNet 1.1 | 53.47 | 12.18 | + +* samsung Galaxy S9+(Snapdragon 845, Adreno 630) + +| benchmark model | cpu time(thread 1,fp16, ms) | gpu time(ms) | +|-----------------|-----------------------------|--------------| +| DenseNet 121 | 128.19 | 63.65 | +| Inception v3 | 245.01 | 71.00 | +| Inception v4 | 591.45 | 145.76 | +| MnasNet | 21.86 | 9.35 | +| MobileNet v1 | 31.91 | 10.15 | +| MobileNet v2 | 28.22 | 9.89 | +| ResNet50 v1 | 152.59 | 39.94 | +| ResNet50 v2 | 177.18 | 45.34 | +| ShuffleNet v2 | 13.78 | 9.41 | +| SqueezeNet 1.0 | 15.71 | 6.58 | +| SqueezeNet 1.1 | 15.64 | 7.00 | + +* Oppo K3(Snapdragon 710, Adreno 616) + +| benchmark model | cpu time(thread 
1,fp16, ms) | gpu time(ms) | +|-----------------|-----------------------------|--------------| +| DenseNet 121 | 157.61 | 114.56 | +| Inception v3 | 299.34 | 163.22 | +| Inception v4 | 711.74 | 345.85 | +| MnasNet | 26.08 | 18.69 | +| MobileNet v1 | 39.69 | 23.10 | +| MobileNet v2 | 34.20 | 22.21 | +| ResNet50 v1 | 184.75 | 94.61 | +| ResNet50 v2 | 216.65 | 107.23 | +| ShuffleNet v2 | 16.29 | 12.90 | +| SqueezeNet 1.0 | 19.81 | 15.70 | +| SqueezeNet 1.1 | 19.74 | 15.74 | + +* Intel(R) Xeon(R) Gold 6133 CPU + +| benchmark model | cpu time(thread 1,fp32, ms) | +|-----------------|-----------------------------| +| Resnet50 | 151.00 | +| YoloV5 | 2428.00 | +| Bert-Based | 832.00 | +| Bert-Squad10 | 1093.00 | + +* TITAN Xp GPU + +| benchmark model | gpu time(fp32, ms) | +|-----------------|--------------------| +| Resnet50 | 2.22 | +| YoloV5 | 17.47 | +| Bert-Based | 8.16 | +| Bert-Squad10 | 9.60 | + + +# v0.1 benchmark + +* Kirin970: + +| model | cpu time(single thread, ms) | gpu time(ms) | +|---------------------------|--------------|--------------| +| Mobilenet_v1 | 88 | 12 | +| Mobilenet_v1_int8 | 55 | | +| Mobilenet_v2 | 58 | 11 | +| Mobilenet_v2_int8 | 41 | | +| squeezenet_v1.0 | 127 | 20 | +| squeezenet_v1.0_int8 | 82 | | + +* Snapdragon 835: + +| model | cpu time(single thread, ms) | gpu time(ms) | +|---------------------------|--------------|--------------| +| Mobilenet_v1 | 94 | 16 | +| Mobilenet_v1_int8 | 62 | | +| Mobilenet_v2 | 61 | 14 | +| Mobilenet_v2_int8 | 47 | | +| squeezenet_v1.0 | 122 | 28 | +| squeezenet_v1.0_int8 | 93 | | + +* Snapdragon 845: + +| model | cpu time(single thread, ms) | gpu time(ms) | +|---------------------------|--------------|--------------| +| Mobilenet_v1 | 60 | 10 | +| Mobilenet_v1_int8 | 37 | | +| Mobilenet_v2 | 39 | 8 | +| Mobilenet_v2_int8 | 28 | | +| squeezenet_v1.0 | 74 | 14 | +| squeezenet_v1.0_int8 | 56 | | diff --git a/3rdparty/TNN/doc/cn/development/add_op.md b/3rdparty/TNN/doc/cn/development/add_op.md new file mode 
100644 index 0000000..b2f6022 --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/add_op.md @@ -0,0 +1,182 @@ +# 新增OP + +[English Version](../../en/development/add_op_en.md) + +如果需要的算子不在[算子列表](../user/support.md)中,则需要通过以下步骤添加新的算子。 +* [添加算子解析](#1) +* [添加Layer实现](#2) +* [添加LayerAcc实现](#3) +* [添加单元测试](#4) + +## 1. 添加算子解析 +### 1.1 添加算子参数 + +* 添加LayerType +(1)修改文件 `/source/tnn/core/layer_type.h`,在`LayerType`中添加新算子的枚举,格式为`LAYER_XXX`。 +(2)修改文件 `/source/tnn/core/layer_type.cc`,在`global_layer_type_map`中添加新算子枚举值对应的算子名称,此名称与proto文件中层的名称一致。 + +* 添加LayerParam +如果新算子在proto里除了输入输出blob,还有其他参数,则需要添加LayerParam,修改文件 `/source/tnn/interpreter/layer_param.h`,添加类似`ConvLayerParam`的结构,继承于`LayerParam` + +```cpp + struct ConvLayerParam : public LayerParam { + int pad_type = -1; + // input channels of blob, divide by group + int input_channel = 0; + // the total output channels of blob, not devide by group + int output_channel = 0; + //[w_begin w_end h_begin h_end d_begin d_end] + std::vector pads; + // order [w h d] + std::vector kernels; + // order [w h d] + std::vector strides; + // order [w h d] + std::vector dialations; + int group = 1; + int bias = 0; + int activation_type = ActivationType_None; + }; +``` + +* 添加LayerResource +如果新算子有需要保存到model里的参数,则需要添加LayerResource,修改文件 `/source/tnn/interpreter/layer_resource.h`,添加类似`ConvLayerResource`的结构,继承于`LayerResource` + +```cpp + struct ConvLayerResource : public LayerResource { + // conv layer filter format + ConvLayerFilterFormat filter_format = OIHW; + + // conv layer handle + // NOTE: for deconv, the weight's default format is [n][i][o][h][w] + RawBuffer filter_handle; + + // bias handle + RawBuffer bias_handle; + + // extra scale handle for different precision + RawBuffer scale_handle; + }; +``` + +### 1.2 添加LayerInterpreter +如果新算子添加了LayerParam或者LayerResource,则需要添加对应的`LayerInterpreter`。在文件夹`/source/tnn/interpreter/tnn/layer_interpreter`下添加对应的实现。 +(1)通过`DECLARE_LAYER_INTERPRETER()`声明新算子的Interpreter; 
+(2)通过`REGISTER_LAYER_INTERPRETER()`注册新算子的Interpreter; +(3)实现以下接口: +* `InterpretProto()` -- 解析新算子的LayerParam +* `InterpretResource()` -- 解析新算子的LayerResource +* `SaveProto()` -- 保存新算子的LayerParam +* `SaveResource()` -- 保存新算子的LayerResource + +## 2. 添加Layer实现 +在文件夹 `/source/tnn/layer` 下添加对应layer的实现。 +(1)`DECLARE_LAYER()` 声明新算子的Layer实现; +(2)`REGISTER_LAYER()` 注册新算子的Layer实现; +(3)实现以下接口: +* `InferOutputDataType()` -- 设置对应层输出Blob的数据类型 +* `InferOutputShape()` -- 计算对应层输出Blob的大小 + +## 3. 添加LayerAcc实现 +每个新的算子都需要实现对应设备的LayerAcc。 +### 3.1 CPU平台 +在文件夹`/source/tnn/device/cpu/acc`下添加对应算子的LayerAcc实现。 +(1)`DECLARE_CPU_ACC()` 声明新算子的LayerAcc实现; +(2)`REGISTER_CPU_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Forward()` -- 新算子的cpu实现; + +### 3.2 ARM平台 +在文件夹`/source/tnn/device/arm/acc`下添加对应算子的LayerAcc实现。 +(1)声明新算子的LayerAcc实现,如果没有特殊的参数,可以直接使用`DECLARE_ARM_ACC()`声明; +(2)`REGISTER_ARM_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Init()` -- 对LayerParam和LayerResource进行处理; +* `Reshape()` -- 实现在输入blob大小改变的情况下的逻辑; +* `Forward()` -- 新算子的ARM实现; + +### 3.3 OpenCL平台 +在文件夹`/source/tnn/device/opencl/acc`下添加对应算子的LayerAcc实现。 +(1)声明新算子的LayerAcc实现,如果没有特殊的参数,可以直接使用`DECLARE_OPENCL_ACC()`声明; +(2)`REGISTER_OPENCL_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Init()` -- 对LayerParam和LayerResource进行处理,创建OpenCL的kernel; +* `Reshape()` -- 实现在输入blob大小改变的情况下的逻辑,对于OpenCL,在此处调用SegArgs设置kernel参数; +* `Forward()` -- 执行OpenCL的kernel; + +(4)实现OpenCL的kernel,在目录 `/source/tnn/device/opencl/cl` 添加对应的kernel文件,以.cl为后缀。添加之后需要执行脚本: + + ``` python + python opencl_codegen.py + ``` + +### 3.4 Metal平台 +在文件夹`/source/tnn/device/metal/acc`下添加对应算子的LayerAcc实现。 +(1)声明新算子的LayerAcc实现,如果没有特殊的参数,可以直接使用`DECLARE_METAL_ACC()`声明; +(2)`REGISTER_METAL_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Init()` +* `Reshape()` +* `Forward()` + +(4)实现Metal的kernel,在目录 `/source/tnn/device/metal/acc` 添加对应的metal文件,以.metal为后缀。 + +### 3.5 NPU平台 +在文件夹`/source/tnn/device/huawei_npu/convert`下添加对应算子的LayerConvert实现。 +(1)声明新算子的LayerConvert实现,如果没有其他权重input,可以直接使用`DECLARE_NPU_LAYER`声明; 
+(2)`REGISTER_NPU_LAYER` 注册新算子的LayerConvert实现; +(3)实现以下接口: +* `Convert()` -- 使用ir翻译tnn模型算子; + +### 3.6 X86平台 + +#### 3.6.1 openvino算子导入 +在文件夹`/source/tnn/network/openvino/layer_builder`下添加对应算子的OpenVINOLayerBuilder实现。 +(1)声明新算子的OpenVINOLayerBuilder实现,可以直接使用`DECLARE_OPENVINO_LAYER_BUILDER`声明; +(2)`REGISTER_OPENVINO_LAYER_BUILDER` 注册新算子的LayerConvert实现; +(3)实现以下接口: +* `Build()` -- 将tnn的算子转换成ngraph的node; + +对于openvino不支持或者性能较差的算子可以注册custom op来替代openvino的op。 +(1)在`/source/tnn/network/openvino/custom_layer`下使用`DECLARE_CUSTOM_IMPLEMENTATION`和`REGISTER_CUSTOM_IMPLEMENTATION`进行声明和注册 +(2)在`Build()`函数里使用已注册的custom op来构建ngraph的node + +#### 3.6.2 kernel编写 +在文件夹`/source/tnn/device/x86/acc`下添加对应算子的LayerAcc实现。 +(1)声明新算子的LayerAcc实现,如果没有特殊的参数,可以直接使用`DECLARE_X86_ACC()`声明; +(2)`REGISTER_X86_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Init()` -- 对LayerParam和LayerResource进行处理; +* `Reshape()` -- 实现在输入blob大小改变的情况下的逻辑; +* `Forward()` -- 新算子的X86实现; + +### 3.7 CUDA平台 + +#### 3.7.1 tensorrt算子导入 +在文件夹`/source/tnn/network/tensorrt/layer_builder`下添加对应算子的TensorRTLayerBuilder实现。 +(1)声明新算子的TensorRTLayerBuilder实现,可以直接使用`DECLARE_TENSORRT_LAYER_BUILDER`声明; +(2)`REGISTER_TENSORRT_LAYER_BUILDER` 注册新算子的TensorRTLayerBuilder实现; +(3)实现以下接口: +* `AddToNetwork()` -- 网络导入对应的tensorrt算子。 + +对于tensorrt不支持或者性能较差的算子可以注册plugin op来替代tensorrt的op。 +(1)在`/source/tnn/network/tensorrt/layer_build`下使用`DECLARE_TENSORRT_PLUGIN_LAYER_BUILDER`和`REGISTER_TENSORRT_PLUGIN_LAYER_BUILDER`进行声明和注册 +(2)实现以下接口: +* `supportsFormatCombination` -- 判断支持的数据类型和数据排布; +* `getPluginType` -- 自定义plugin type; +* `getOutputDataType` -- 设定输出data type; +* `AddToNetwork` -- 实现插件导入网络; +* `getOutputDimensions` -- 返回输出尺寸计算公式 +* `getPluginName` -- 自定义plugin name + +#### 3.7.2 kernel编写 +在文件夹`/source/tnn/device/cuda/acc`下添加对应算子的LayerAcc实现。 +(1)声明新算子的LayerAcc实现,如果没有特殊的参数,可以直接使用`DECLARE_CUDA_ACC()`声明; +(2)`REGISTER_CUDA_ACC()` 注册新算子的LayerAcc实现; +(3)实现以下接口: +* `Init()` -- 对LayerParam和LayerResource进行处理; +* `Reshape()` -- 实现在输入blob大小改变的情况下的逻辑; +* `Forward()` -- 
新算子的CUDA实现; + +## 4. 添加单元测试 +在文件夹 `/test/unit_test/layer_test` 下添加对应层的单元测试文件。 diff --git a/3rdparty/TNN/doc/cn/development/architecture.md b/3rdparty/TNN/doc/cn/development/architecture.md new file mode 100644 index 0000000..2dbd012 --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/architecture.md @@ -0,0 +1,147 @@ +# 架构详解 + +[English Version](../../en/development/architecture_en.md) + +## 一、API设计 +考虑开源库后期维护及版本兼容性,所有对外暴露接口均通过include目录统一管理。具体API相关介绍可参见[API文档](../user/api.md) + + +## 二、模型解析 + +对模型解析相关接口进行了抽象,可支持多种模型格式解析和扩充,相关代码见source/tnn/interpreter模块。 + +
+ +AbstractModelInterpreter定义了抽象的Interpret接口,不同的模型解析器解析不同类型模型。DefaultModelInterpreter相关的接口将相关结果存入NetStruture和NetResource结构中,部分第三方模型无法完成内部结构解析的有单独适配,如CoreMLModelInterpreter,以完成第三方库适配。 + +不同的模型解析器均有对应的creator + +```cpp +// @brief ModelInterpreterCreator define model interpreter creator interface +class ModelInterpreterCreator { +public: + virtual ~ModelInterpreterCreator() {}; + virtual AbstractModelInterpreter* CreateModelInterpreter() = 0; +}; + +// @brief TypeModelInterpreterCreator create different type model interpreter +template +class TypeModelInterpreterCreator:public ModelInterpreterCreator { + virtual AbstractModelInterpreter* CreateModelInterpreter() { + return new T(); + } +}; +``` + +不同的model interpreter creator均通过Register进行注册。 + +```cpp +//@brief TypeModelInterpreterRegister register TypeModelInterpreterCreator +template +class TypeModelInterpreterRegister { +public: + TypeModelInterpreterRegister(ModelType type) { + GetGlobalModelInterpreterCreatorMap()[type] = std::shared_ptr(new T()); + } +}; + +``` + +以TNN模型解析注册为例: TypeModelInterpreterRegister\> g\_tnn\_model\_interpreter\_register(MODEL\_TYPE\_TNN); + +通过TypeModelInterpreterRegister构造函数,可将TNN对应的TypeModelInterpreterCreator\注册到全局model interpreter creator map中,后续通过model type即可获取对应creator并构建对应的model interpreter。 + + +## 三、网络构建 + +网络构建主要包含两大部分,第一部分为网络Layer构建,第二部分为Blob节点构建。 + + +```cpp + +//@brief BaseLaye define the layer interface +class BaseLayer { +public: + + ... + + virtual Status Init(Context* context, LayerParam* param, + LayerResource* resource, std::vector& inputs, + std::vector& outputs, + AbstractDevice* device); + + ... +}; + +``` + +与前面模型注册机制类似,不同Layer会注册不同的Layer Creator。通过Layer Type获取对应的Layer Creator后即可构建出对应的Layer,Layer构建完成后可计算对应输出blob尺寸以及创建平台加速算子。 + +Blob节点构建核心在于内存的分配和优化,主要分为blob内存循环复用,blob内存拼接与监控。 + +
+ +首先不同layer输出blob间内存会通过内部算法实现循环复用,不同blob间内存复用会优先选择尺寸接近的blob。 + +确定blob内存复用关系后,会对blob内存进行拼接,并统一分配内存,最终同一Instance不同blob间持有相同的base指针以及不同的偏移量,同一线程多个instance间以及不同线程instance间内存有了内存复用的基础。TNN内部提供了单一线程内不同instance间内存复用自动实现机制,通过SHARE\_MEMORY\_MODE\_SHARE\_ONE\_THREAD构建的Instance会自动实现多Instance内存复用。同时SHARE\_MEMORY\_MODE\_SET\_FROM\_EXTERNAL构建的Instance支持内存外部传入,由调用者维护内存复用关系以及内存分配释放,对于多线程复用还需要处理线程间加锁机制。 + +## 四、多平台加速算子实现 + +
+ +抽象AbstractDevice接口,用于隐藏不同Device实现细节。提供Device Memory 尺寸计算,Device Memory分配释放,内存CPU Memory与Device meomoy拷贝,Device Layer加速算子构建,以及Instance对应Device Context构建等接口。 + +```cpp +// @brief AbstractDevice define create memory, context and layer acc interface. +class AbstractDevice { +public: + ... + virtual BlobMemorySizeInfo Calculate(BlobDesc& desc) = 0; + ... + virtual Status Allocate(void** handle, MatType mat_type, DimsVector dims) = 0; + ... + virtual Status Allocate(void** handle, BlobMemorySizeInfo& size_info) = 0; + ... + virtual Status Free(void* handle) = 0; + ... + virtual Status CopyToDevice(BlobHandle* dst, const BlobHandle* src, + BlobDesc& desc, void* command_queue) = 0; + ... + virtual Status CopyFromDevice(BlobHandle* dst, const BlobHandle* src, + BlobDesc& desc, void* command_queue) = 0; + ... + virtual AbstractLayerAcc* CreateLayerAcc(LayerType type) = 0; + ... + virtual Context* CreateContext(int device_id) = 0; + ... +}; +``` + +网络构建根据配置的DeviceType可获取对应的Device实现,不同的Layer通过CreateLayerAcc接口即可构建特定平台加速算子,并通过统一的抽象基类接口AbstractLayerAcc进行交互。 + +```cpp + +// @brief AbstractLayerAcc define the layer acc interface +class AbstractLayerAcc { +public: + + ... + + virtual Status Init(Context *context, LayerParam *param, + LayerResource *resource, + const std::vector &inputs, + const std::vector &outputs) = 0; + + ... 
+ + virtual Status Forward(const std::vector &inputs, + const std::vector &outputs) = 0; +}; + +``` + +同样不同的LayerAcc通过注册机制进行注册,Layer根据LayerType即可构建不同的LayerAcc。 + +## 五、单元测试 + +TNN 单元测试基于googletest构建,当前主要对Layer Acc以及blob converter构建了单元测试。单元测试以CPU Default实现为对齐基准,以监控不同平台加速算子实现,具体单元测试相关介绍可参见[单元测试](unit_test.md) diff --git a/3rdparty/TNN/doc/cn/development/contributing.md b/3rdparty/TNN/doc/cn/development/contributing.md new file mode 100644 index 0000000..9622d1e --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/contributing.md @@ -0,0 +1,15 @@ +# 贡献代码 + +[English Version](../../en/development/contributing_en.md) +## 贡献代码 + +[添加算子](add_op.md) + +## 编程风格 + +TNN项目对于C、C++、Objective-C、Python、Shell代码风格参照 +[谷歌开源项目风格指南](https://zh-google-styleguide.readthedocs.io/en/latest/contents/) + +## 代码格式化 + +TNN项目代码格式化使用clang-format、git-clang-format, 格式化后部分缩进格式与谷歌开源项目风格有差异,以clang-format为准。 diff --git a/3rdparty/TNN/doc/cn/development/model_check.md b/3rdparty/TNN/doc/cn/development/model_check.md new file mode 100644 index 0000000..6984f4c --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/model_check.md @@ -0,0 +1,58 @@ +# 模型结果校验 + +[English Version](../../en/development/model_check_en.md) + +## 一、工具的作用 +校验对应平台(OpenCL,Metal,Cuda,ARM,HuaweiNPU)的模型输出结果是否正确。 + +## 二、编译 +编译model_check工具需要将以下宏设置为ON: +* 打开以下选项编译TNN(编译方法参照[TNN编译文档](../user/compile.md)) +* `TNN_CPU_ENABLE` +* `TNN_MODEL_CHECK_ENABLE` +* 对应device的宏,如`TNN_OPENCL_ENABLE`, `TNN_ARM_ENABLE` + +## 三、校验工具使用 +### 1. 命令 +``` +./model_check [-h] [-p] [-m] [-d] [-i] [-f] [-e] [-n] [-s] [-o] [-b] +``` +### 2. 参数说明 + +|命令参数 |是否必须|带参数 |参数说明 | +|:------------------|:------:|:-----:|:-------------------------------------------| +|-h, --help | | |输出命令提示。 | +|-p, --proto |√ |√|指定tnnproto模型描述文件。 | +|-m, --model |√ |√|指定tnnmodel模型参数文件。 | +|-d, --device |√ |√|指定模型执行的平台,如OPENCL,ARM,METAL,CUDA,HUAWEI_NPU等。 | +|-i, --input | |√|指定输入文件。目前支持格式为:
• 文本文件(文件后缀为.txt), 格式与模型转换工具导出的输入格式一致。
• 常用图片格式文件(文件后缀为 .jpg .jpeg .png .bmp)
如果不指定,则会使用 (-1, 1) 随机输入| +|-f, --ref | |√|采用指定输出进行结果对比。目前支持格式为:
• 文本文件(文件后缀为.txt),格式与模型转换工具导出的输出格式一致。| +|-e, --end | | |仅校验模型的最终输出。 | +|-n, --bias | |√|预处理,仅对输入为图片时有效。对输入数据各通道进行bias操作,参数格式为:0.0,0.0,0.0| +|-s, --scale | |√|预处理,仅对输入为图片时有效。对输入数据各通道进行scale操作,参数格式为:1.0,1.0,1.0| +|-o, --output | | |是否保存最终的输出。 | +|-b, --batch | | |验证多batch情况下,每个batch结果是否正确。(还未开发完成) | + +## 四、执行脚本 +### 1. Android +#### 1.1 模型准备 +将待校验的模型的tnnproto和tnnmodel文件拷贝进`/platforms/android/models`,并改名为`test.tnnproto`和`test.tnnmodel` +#### 1.2 执行脚本 +``` +cd /platforms/android/ +./model_check_android.sh -c -m -p +``` +### 2. Linux +#### 2.1. 编译脚本 +``` +cd /platforms/linux/ +./build_model_check.sh -c +``` +#### 2.2. 执行命令 +``` +/platforms/linux/build/model_check -p -m -d +``` + +## 五、工具限制 +* 目前只支持fp32的模型校验; +* 对于逐层校验,只针对fp32精度下的结果进行校验;对于最后结果校验,使用Auto精度进行校验。 diff --git a/3rdparty/TNN/doc/cn/development/profiling.md b/3rdparty/TNN/doc/cn/development/profiling.md new file mode 100644 index 0000000..0642331 --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/profiling.md @@ -0,0 +1,126 @@ +# 模型性能分析 + +[English Version](../../en/development/profiling_en.md) + +分析模型耗时情况 + +## 一、iOS平台耗时测试 +### 测试步骤 +1. 添加测试模型 + + 在`/model`目录下添加测试模型,每个模型一个文件夹,文件夹中包含以proto和model结尾的模型文件。目前工程中已有模型squeezenetv1.1 + +2. 打开benchmark工程 + + 进入目录`/benchmark/benchmark_ios`,双击打开benchmark工程 + +3. 设置开发者账号 + + 如下图点击benchmark工程,找到工程设置`Signing & Capabilities`,点击Team选项卡选择`Add an Account...` + +
+ + 在如下界面输入Apple ID账号和密码,添加完成后回到`Signing & Capabilities`界面,并在Team选项卡中选中添加的账号。如果没有Apple ID也可以通过`Create Apple ID`选项根据相关提示进行申请。 + + `PS:申请Apple ID无需付费,可以即时通过,通过后才可在真机上运行APP调试` + +
+ + +4. 真机运行 + + 4.1 修改`Bundle Identitifier` + + 如图在现有`Bundle Identifier`后随机添加后缀(限数字和字母),避免个人账户遇到签名冲突。 + +
+ + 4.2 验证授权 + + 首次运行先利用快捷键`Command + Shift + K`对工程进行清理,再执行快捷键`Command + R`运行。如果是首次登陆Apple ID,Xcode会弹框报如下错误,需要在iOS设备上根据提示进行授权验证。一般来说手机上的授权路径为:设置 -> 通用 -> 描述文件与设备管理 -> Apple Development选项 -> 点击信任 + +
+ + 4.3 运行结果 + + 首次运行先利用快捷键`Command + Shift + K`对工程进行清理,再执行快捷键`Command + R`运行。在界面上点击Run按钮,界面会显示model目录下所有模型的CPU和GPU耗时情况。iPhone7真机运行结果如下图。 + +
+ + PS: + + a) 由于GPU和CPU加速原理不同,具体模型的GPU性能不一定比CPU高,与具体机型、模型结构以及工程实现有关。欢迎大家参与到TNN开发中,共同进步。 + + b) 如遇到`Unable to install...`错误提示,请在真机设备上删除已有的benchmark app,重新运行安装。 + + c) 真机运行时,如果遇到CodeSign错误`Command CodeSign failed with a nonzero exit code`,可参看issue20 `iOS Demo运行步骤说明` + +## 二、Android平台耗时测试 +### 1. 环境搭建 +#### 1.1 编译环境 +参考[TNN编译文档](../user/compile.md) 中Android库编译,检查环境是否满足要求。 + +#### 1.2 执行环境 +* adb命令配置 +下载[安卓SDK工具](https://developer.android.com/studio/releases/platform-tools),将`platform-tool`目录加入`$PATH`环境变量中。 +PS: 如果adb版本过低,可能执行脚本会失败。当前测试的adb版本为:29.0.5-5949299 +``` +export PATH=/platform-tools:$PATH +``` + +### 2. 添加模型 +在`/benchmark/benchmark-model`目录下,将要测试模型的tnnproto放入文件夹,例如, +``` +cd /benchmark/benchmark-model +cp mobilenet_v1.tnnproto . +``` + + +### 3. 修改脚本 +在脚本`benchmark_models.sh`中的`benchmark_model_list`变量里添加模型文件名,例如: +``` + benchmark_model_list=( + #test.tnnproto \ + mobilenet_v1.tnnproto \ # 待测试的模型文件名 +) +``` + +### 4. 执行脚本 +``` +./benchmark_models.sh [-32] [-c] [-b] [-f] [-d] [-bs] [-t] +参数说明: + -32 编译32位的库,否则为64位 + -c 删除之前的编译文件,重新编译 + -b 仅编译,不执行 + -f 打印每一层的耗时,否则是整个网络的平均耗时。 + -t 指定执行的平台。需要加上 + -bs shell运行可执行文件测试 +``` +P.S. 不指定 -t, 默认跑CPU和GPU, 华为npu benchmark需通过-t HUAWEI_NPU特殊制定. +#### 4.1 全网络性能分析: +分析整体网络耗时,执行多次,获取平均性能。 +执行脚本: +``` +./benchmark_models.sh -c +``` +结果如图: +
+ +执行结果会保存在`benchmark_models_result.txt`中。 + + +#### 4.2 逐层性能分析: +逐层性能分析工具可准备计算各层耗时,以便进行模型优化和op性能问题定位。 +执行脚本: +``` +./benchmark_models.sh -c -f +``` +结果如图: +
+ +执行结果会保存在`benchmark_models_result.txt`中。 +P.S. 华为npu不支持每层分析。 + +### 5. 特殊说明 +* 对于OpenCL平台,逐层性能分析的目的是分析kernel的耗时分布,其中为了打印每层耗时,有额外开销,只有kernel时间具有参考意义。如果要看整体实际性能,需要参考全网络性能分析。 +* Android系统相比shell执行可执行文件耗时测试,app耗时测试的性能更贴近真实安卓app执行的性能。受安卓调度策略的影响,两种方式的性能可能有明显差异。综上所述,安卓app耗时测试更为推荐。 diff --git a/3rdparty/TNN/doc/cn/development/resource/android_profiling.jpg b/3rdparty/TNN/doc/cn/development/resource/android_profiling.jpg new file mode 100644 index 0000000..64d565d Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/android_profiling.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/ios_add_account_benchmark.jpg b/3rdparty/TNN/doc/cn/development/resource/ios_add_account_benchmark.jpg new file mode 100644 index 0000000..b13c9f7 Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/ios_add_account_benchmark.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/ios_benchmark_result.jpg b/3rdparty/TNN/doc/cn/development/resource/ios_benchmark_result.jpg new file mode 100644 index 0000000..ad38168 Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/ios_benchmark_result.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/ios_set_account.jpg b/3rdparty/TNN/doc/cn/development/resource/ios_set_account.jpg new file mode 100644 index 0000000..208a96d Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/ios_set_account.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/ios_set_bundleid_benchmark.jpg b/3rdparty/TNN/doc/cn/development/resource/ios_set_bundleid_benchmark.jpg new file mode 100644 index 0000000..fde76f7 Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/ios_set_bundleid_benchmark.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/ios_verify_certificate_benchmark.jpg b/3rdparty/TNN/doc/cn/development/resource/ios_verify_certificate_benchmark.jpg new file mode 100644 index 0000000..5ec90b2 Binary files /dev/null and 
b/3rdparty/TNN/doc/cn/development/resource/ios_verify_certificate_benchmark.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/resource/opencl_profiling.jpg b/3rdparty/TNN/doc/cn/development/resource/opencl_profiling.jpg new file mode 100644 index 0000000..1e1b587 Binary files /dev/null and b/3rdparty/TNN/doc/cn/development/resource/opencl_profiling.jpg differ diff --git a/3rdparty/TNN/doc/cn/development/unit_test.md b/3rdparty/TNN/doc/cn/development/unit_test.md new file mode 100644 index 0000000..dc3a9e0 --- /dev/null +++ b/3rdparty/TNN/doc/cn/development/unit_test.md @@ -0,0 +1,49 @@ +# 单元测试 + +[English Version](../../en/development/unit_test_en.md) + +本文档主要介绍当前单元测试的目的、用法及注意事项。 + +## 单元测试用途 + +当前单元测试有两个用途: + +1. 验证各个OP在不同平台上的结果正确性。 +2. 充当OP性能测试工具,在不需要模型的情况下测试OP性能。 + +## 需了解的代码的信息 + +TNN代码中OP通过Layer这个类型来实现,但Layer类型仅仅实现了Blob Shape推理等计算无关的逻辑。不同平台的计算由layer_acc实现。 +因此,Layer单元测试中通过两个层计算,然后对比结果,以此对比结果正确性。 + +## 使用方法 + +### 编译方法 + +* 打开以下选项编译TNN(编译方法参照[TNN编译文档](../user/compile.md)) +* TNN_UNIT_TEST_ENABLE=ON +* 如果用于OP性能测试,需同时打开 TNN_BENCHMARK_ENABLE 开关: +* TNN_BENCHMARK_ENABLE=ON + +### 运行方法 + +编译成功后执行以下命令: + + ./test/unit_test/unit_test -ic 1 + +ic 参数用于控制每个单元测试重复进行的次数,通常用1即可,其他可选参数如下: + + -dt {ARM|OPENCL|METAL} // 测试的计算设备类型 + -lp ${load_library_path} // OPENCL 及 METAL 需要加载的库路径 + -th ${num_threads} // 线程数,默认为1 + -ub {0|1} // 是否打印计算性能数据(GFLOPS),用于性能测试 + +一个实际的测试例子如下: + + ./test/unit_test/unit_test -ic 1 -dt ARM -th 4 -ub 0 + + +## 注意事项 + +单元测试中通过GTEST WithParamInterface 接口生成了很多参数组合。若需更改或自定义参数,可查看 INSTANTIATE_TEST_SUITE_P 宏相关代码。 + diff --git a/3rdparty/TNN/doc/cn/faq.md b/3rdparty/TNN/doc/cn/faq.md new file mode 100644 index 0000000..69b858d --- /dev/null +++ b/3rdparty/TNN/doc/cn/faq.md @@ -0,0 +1,154 @@ +# FAQ 常见问题 + +[English Version](../en/faq_en.md) + +## 一、编译问题 + +### 编译环境要求: + general: + cmake >= 3.1 + gcc >= 4.8 + NDK >= r14b + 模型转换: + python >= 3.5 + onnxruntime>=1.1 + onnx-simplifier>=0.2.4 + protobuf >= 3.0 + +### ARMv8.2编译报错 +若要支持ARMv8.2编译,ndk版本版本至少为r18b + 
+## 二、模型转换问题 + +### 如何支持tensorflow, caffe, mxnet模型? +* 我们统一通过onnx中间格式支持各大训练框架,开源社区维护有很好的各大框架转换为onnx的工具 +* [tensorflow2onnx](https://github.com/onnx/tensorflow-onnx): typical usage: python -m tf2onnx.convert --inputs-as-nchw [输入tensor]:0 --graphdef [输入文件].pb --inputs [输入tensor]:0 --outputs [输出tensor]:0 --opset 11 --output [输出文件].onnx +* [caffe2onnx](./user/caffe2tnn.md) +* [Mxnet: export onnx model](https://mxnet.apache.org/api/python/docs/tutorials/deploy/export/onnx.html) +* [Pytorch: EXPORTING A MODEL FROM PYTORCH TO ONNX](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) + +### 模型对齐问题排查 +* [模型对齐问题排查](./model_align.md) + +## 三、运行问题 + +### 是否支持可以在PC上运行 +TNN支持在linux和windows上编译和运行 + +### 如何运行bfp16代码 +TNNTest的运行参数-pr设为LOW + +### cv::Mat如何转换成TNN::Mat +```cpp +cv::Mat cv_mat; +MatType mat_type = N8UC4; // if cv_mat.channels() == 3, then mat_type = N8UC3. +DimsVector dims = {1, cv_mat.channels(), cv_mat.rows, cv_mat.cols}; +auto tnn_mat = new TNN::Mat(DeviceType, mat_type, dims, (void *)cv_mat.ptr); +``` + +### 常见错误码介绍. +Status调用description()接口可获取更多错误信息描述。 + +0x1002(4098): 模型解析错误。检查确保ModelConfig配置的为文件内容而非文件路径。 + +0x6005(24581): 模型weights信息缺失。TNN的benchmark可以只用proto文件,是因为开启了TNN_BENCHMARK_MODE,weights自动生成,仅用来评估速度。 + +0x2000(8192): 错误信息not support model type。检查Android静态库集成链接需添加-Wl,--whole-archive tnn -Wl,--no-whole-archive,iOS库集成链接需要添加force_load。 + +0x9000(36864): device type类型不支持。(1)确保相关device type编译选项已开启 (2)Android静态库集成链接需添加-Wl,--whole-archive tnn -Wl,--no-whole-archive,iOS库集成链接需要添加force_load。 + +## 四、NPU相关问题 + +### 如何创建华为NPU编译环境? +选项1: + 在 /thrid_party/huawei_npu/ 下运行 ./download_ddk.sh 脚本下载最新版的ddk。 + + +选项2: +1. 到华为开发者联盟下载DDK[https://developer.huawei.com/consumer/cn/doc/overview/HUAWEI_HiAI] +2. 解压缩 +3. 进入到下载文件夹下的`ddk/ai_ddk_lib`目录 +4. 在`/third_party/huawei_npu/hiai_ddk_latest/`下创建`armeabi-v7a`文件夹, 并将ai_ddk_lib目录下的lib文件夹中所有文件复制到 `/third_party/huawei_npu/hiai_ddk_latest/armeabi-v7a` +5. 
在`/third_party/huawei_npu/hiai_ddk_latest/`下创建`arm64-v8a`文件夹,并将ai_ddk_lib目录下的lib64文件夹中所有文件复制到 `/third_party/huawei_npu/hiai_ddk_latest/arm64-v8a` +6. 将ai_ddk_lib目录下include`文件夹`复制到 `/third_party/huawei_npu/hiai_ddk_latest/`目录下 + +`/third_party/huawei_npu/hiai_ddk_latest/`文件结构应该如下: + +``` +hiai_ddk_latest +├── arm64-v8a +│   ├── libcpucl.so +│   ├── libhcl.so +│   ├── libhiai.so +│   ├── libhiai_ir.so +│   └── libhiai_ir_build.so +├── armeabi-v7a +│   ├── libcpucl.so +│   ├── libhcl.so +│   ├── libhiai.so +│   ├── libhiai_ir.so +│   └── libhiai_ir_build.so +└── include + ├── HiAiAippPara.h + ├── HiAiModelManagerService.h + ├── HiAiModelManagerType.h + ├── graph + │   ├── attr_value.h + │   ├── buffer.h + │   ├── common + │   │   └── secures\tl.h + │   ├── debug + │   │   └── ge_error_codes.h + │   ├── detail + │   │   └── attributes_holder.h + │   ├── graph.h + │   ├── model.h + │   ├── op + │   │   ├── all_ops.h + │   │   ├── array_defs.h + │   │   ├── const_defs.h + │   │   ├── detection_defs.h + │   │   ├── image_defs.h + │   │   ├── math_defs.h + │   │   ├── nn_defs.h + │   │   └── random_defs.h + │   ├── operator.h + │   ├── operator_reg.h + │   ├── tensor.h + │   └── types.h + └── hiai_ir_build.h +``` + +### NPU版本限制: +* 如果获取手机的ROM在100.320.xxx.xxx以下 + 报错 + ERROR: npu is installed but is below 100.320.xxx.xxx +* 如果没有npu或是非华为手机 : + 报错 + ERROR: GetRomVersion(ROM): npu is not installed or rom version is too low + +### 如何更新到最新的ROM去支持NPU? +* 到 设置 >> 系统和更新 >> 软件更新中检查最新的ROM版本并更新。 + +### 如何创建RKNPU编译环境? +1. 在`/third_party`下创建rknpu文件夹并进入,然后执行: `git clone https://github.com/airockchip/rknpu_ddk.git`。 +2. 在`/scripts/build_aarch64_linux.sh`文件中加入`-DTNN_RK_NPU_ENABLE:BOOL=ON`选项并编译即可。 + + +## 五、其他 +### 如何获取模型中间结果? +* 修改项目目录下 /source/tnn/utils/blob_dump_utils.h 中 +* \#define DUMP_INPUT_BLOB 0 --> #define DUMP_INPUT_BLOB 1,获取每层输入 +* \#define DUMP_OUTPUT_BLOB 0 --> #define DUMP_OUTPUT_BLOB 1,获取每层输出 +* 仅作为调试使用 + +### 七、如何获取模型各个layer耗时? 
+* 参考profiling文档[性能测试](./development/profiling.md) + +### 网络问题 +```text +//mac下homebrew安装 +//https://zhuanlan.zhihu.com/p/59805070 +//https://brew.sh/index_zh-cn +//替换国内镜像的安装脚本 +``` diff --git a/3rdparty/TNN/doc/cn/front_page.md b/3rdparty/TNN/doc/cn/front_page.md new file mode 100644 index 0000000..3efeab3 --- /dev/null +++ b/3rdparty/TNN/doc/cn/front_page.md @@ -0,0 +1,124 @@ +
+ +[English Version](../en/front_page_en.md) + +## 快速开始 + +使用TNN非常简单,如果你有一个已经训练好的模型, 那么一般而言通过以下三个步骤就能完成模型在目标平台上的部署。 +1. 第一步是把训练好的模型转换成TNN的模型,为此我们提供了丰富的工具来帮助你完成这一步,无论你使用的是Tensorflow、Pytorch、或者Caffe,都可以轻松完成转换。 +详细的手把手教程可以参见这里[如何转换模型](./user/convert.md)。 + +2. 当你完成了模型的转换,第二步就是编译目标平台的TNN引擎了,你可以根据自己的目标平台的硬件支持情况,选择CPU/ARM/OpenCL/Metal/NPU等加速方案。 + 对于这些平台,TNN都提供了一键编译的脚本,使用非常方便。详细步骤可以参考这里[如何编译TNN](./user/compile.md)。 + +3. 最后一步就是使用编译好的TNN引擎进行推理,你可以在自己的应用程序中嵌入对TNN的调用,这方面我们提供了丰富而详实的demo来帮助你完成。 + * [从0开始跑通一个iOS Demo](./user/demo.md) + * [从0开始跑通一个Android Demo](./user/demo.md) + +## 技术方案 + +TNN作为一个移动端高性能、轻量级的推断框架,同时拥有跨平台、高性能、模型压缩、代码裁剪等众多突出优势。TNN框架借鉴了业界主流开源框架的优点,沉淀和整合了优图实验室Rapidnet,ncnn框架上的积累,并联合深度学习框架OTeam各个部门(PCG,TEG,IEG),共同打造公司级统一移动端推断框架。 +目前,TNN已在各大实际业务中上线,其具有的以下特性获得了广泛的好评。 + +* 计算优化 + * 针对不同架构在硬件指令发射、吞吐、延迟、缓存带宽、缓存延迟、寄存器数量等特点,深度优化底层算子,极致利用硬件算力 + * 主流硬件平台(CPU: ARMv7, ARMv8, GPU: Mali, Adreno, Apple) 深度调优 + * CNN核心卷积运算通过Winograd, Tile-GEMM, Direct Conv等多种算法实现,保证不同参数、计算尺度下高效计算 + * Op融合:离线分析网络计算图,多个小Op(计算量小、功能较简单)融合运算,减少反复内存读取、kernel启动等开销 + +* 低精度优化 + * 支持INT8, FP16低精度计算,减少模型大小、内存消耗,同时利用硬件低精度计算指令加速计算 + * 支持INT8 WINOGRAD算法,(输入6bit), 在精度满足要求的情况下,进一步降低模型计算复杂度 + * 支持单模型多种精度混合计算,加速计算同时保证模型精度 + +* 内存优化 + * 高效”内存池”实现:通过DAG网络计算图分析,实现无计算依赖的节点间复用内存,降低90%内存资源消耗 + * 跨模型内存复用:支持外部实时指定用于网络内存,实现“多个模型,单份内存”。 + +* 主流模型实测性能:v0.1 2020.05.29 + +> 麒麟970: + +| model | cpu time(single thread, ms) | gpu time(ms) | npu time(ms) +|---------------------------|--------------|--------------|---------------| +| Mobilenet_v1 | 88 | 12 | 4.9 | +| Mobilenet_v1_int8 | 55 | | | +| Mobilenet_v2 | 58 | 11 | 8.0 | +| Mobilenet_v2_int8 | 41 | | | +| squeezenet_v1.0 | 127 | 20 | 5.1 | +| squeezenet_v1.0_int8 | 82 | | | + + +> 骁龙835: + + | model | cpu 1 thread(ms) | gpu time(ms) | + |---------------------------|--------------|--------------| + | Mobilenet_v1 | 94 | 16 | + | Mobilenet_v1_int8 | 62 | | + | Mobilenet_v2 | 61 | 14 | + | Mobilenet_v2_int8 | 47 | | + | squeezenet_v1.0 | 122 | 28 | + | squeezenet_v1.0_int8 | 93 
| | + + +> 骁龙845: + + +| model | cpu 1 thread(ms) | gpu time(ms) | +|---------------------------|--------------|--------------| +| Mobilenet_v1 | 60 | 10 | +| Mobilenet_v1_int8 | 37 | | +| Mobilenet_v2 | 39 | 8 | +| Mobilenet_v2_int8 | 28 | | +| squeezenet_v1.0 | 74 | 14 | +| squeezenet_v1.0_int8 | 56 | | + + +* TNN架构图: + + +
+ +* 通过ONNX支持TensorFlow, Pytorch, MxNet, Caffe等多种训练框架,充分利用和融入不断完善的ONNX开源生态。当前支持ONNX算子55个,近期会完善到约80个,覆盖主流CNN网络 +* 支持主流安卓、iOS、embedded Linux,windows操作系统,支持ARM CPU, GPU硬件平台(近期还会加入达芬奇NPU支持) +* 模块化设计,将模型解析、计算图构建、优化、底层硬件适配、高性能kernel实现各部分抽象隔离,通过Factory Mode注册、构建设备,方便接入更多的底层硬件、加速方案。 +* Runtime无任何第三方库依赖,CPU动态库尺寸仅约400KB,并提供基础图像变换操作,调用简单便捷。跨平台模型统一、调用接口统一,通过单个配置参数快速切换。 + +## 能力展示 +* [支持的算子](./user/support.md) +* [支持的网络](./user/support.md) +* [支持的架构](./user/support.md) +* [Benchmark性能测试方法](./development/profiling.md) + +## 使用手册 +* [从源码编译](./user/compile.md) +* [工具集]() + * [模型转换](./user/convert.md) + * [模型量化](./user/quantization.md) + * [模型可视化](https://lutzroeder.github.io/netron/) + * [性能分析工具](./development/profiling.md) + * [模型对齐工具](./development/model_check.md) + +## API文档 +* [API调用](./user/api.md) + +## 贡献者须知 +* [开发基础须知](./development/contributing.md) +* [架构详解](./development/architecture.md) +* [新增OP](./development/add_op.md) +* [单元测试](./development/unit_test.md) + +## Roadmap +* [Road map](./user/roadmap.md) + +## FAQ +* [FAQ 常见问题](./faq.md) + +## 加入我们 + +* 欢迎大家参与,协同共建,打造业界最好的移动端推理框架。 + +* 技术交流QQ群: 913940506 答案:TNN + +* QQ群二维码: +
diff --git a/3rdparty/TNN/doc/cn/get_started.md b/3rdparty/TNN/doc/cn/get_started.md new file mode 100644 index 0000000..be67e11 --- /dev/null +++ b/3rdparty/TNN/doc/cn/get_started.md @@ -0,0 +1,16 @@ +
+ +# 从0开始跑通一个Demo + +[English Version](../en/get_started_en.md) + +使用TNN非常简单,如果你有一个已经训练好的模型, 那么一般而言通过以下三个步骤就能完成模型在目标平台上的部署。 +1. 第一步是把训练好的模型转换成TNN的模型,为此我们提供了丰富的工具来帮助你完成这一步,无论你使用的是Tensorflow、Pytorch、或者Caffe,都可以轻松完成转换。 +详细的手把手教程可以参见这里[如何转换模型](./user/convert.md)。 + +2. 当你完成了模型的转换,第二步就是编译目标平台的TNN引擎了,你可以根据自己的目标平台的硬件支持情况,选择CPU/ARM/OpenCL/Metal/NPU等加速方案。 + 对于这些平台,TNN都提供了一键编译的脚本,使用非常方便。详细步骤可以参考这里[如何编译TNN](./user/compile.md)。 + +3. 最后一步就是使用编译好的TNN引擎进行推理,你可以在自己的应用程序中嵌入对TNN的调用,这方面我们提供了丰富而详实的demo来帮助你完成。 + * [从0开始跑通一个iOS Demo](./user/demo.md) + * [从0开始跑通一个Android Demo](./user/demo.md) diff --git a/3rdparty/TNN/doc/cn/imgs/blob_memory.png b/3rdparty/TNN/doc/cn/imgs/blob_memory.png new file mode 100644 index 0000000..21ff77d Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/blob_memory.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/device.png b/3rdparty/TNN/doc/cn/imgs/device.png new file mode 100644 index 0000000..5ee93fc Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/device.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/device_factory.png b/3rdparty/TNN/doc/cn/imgs/device_factory.png new file mode 100644 index 0000000..931df24 Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/device_factory.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/group.png b/3rdparty/TNN/doc/cn/imgs/group.png new file mode 100644 index 0000000..cc236ff Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/group.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/model_align.png b/3rdparty/TNN/doc/cn/imgs/model_align.png new file mode 100644 index 0000000..e523f27 Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/model_align.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/model_reinterpreter.png b/3rdparty/TNN/doc/cn/imgs/model_reinterpreter.png new file mode 100644 index 0000000..5af0188 Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/model_reinterpreter.png differ diff --git a/3rdparty/TNN/doc/cn/imgs/roadmap.jpg b/3rdparty/TNN/doc/cn/imgs/roadmap.jpg new file mode 100644 
index 0000000..7be2035 Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/roadmap.jpg differ diff --git a/3rdparty/TNN/doc/cn/imgs/tnn_architect.jpg b/3rdparty/TNN/doc/cn/imgs/tnn_architect.jpg new file mode 100644 index 0000000..20fef51 Binary files /dev/null and b/3rdparty/TNN/doc/cn/imgs/tnn_architect.jpg differ diff --git a/3rdparty/TNN/doc/cn/jobs.md b/3rdparty/TNN/doc/cn/jobs.md new file mode 100644 index 0000000..cdfb8a2 --- /dev/null +++ b/3rdparty/TNN/doc/cn/jobs.md @@ -0,0 +1 @@ +Todo: 工作机会 diff --git a/3rdparty/TNN/doc/cn/model_align.md b/3rdparty/TNN/doc/cn/model_align.md new file mode 100644 index 0000000..ff14501 --- /dev/null +++ b/3rdparty/TNN/doc/cn/model_align.md @@ -0,0 +1,145 @@ +# 模型对齐常见问题 + +[English Version](../en/model_align_en.md) + +在使用转换得到的TNN模型进行推理时,有时会遇到TNN模型的推理结果与原始模型不对齐的情况。此文档总结了模型不对齐问题的主要原因、常见的不对齐算子以及分析和处理不对齐问题的方法。模型不对齐问题的整体处理流程可参考下图。 + +
+ +## 一、模型对齐的验证与检查 + +### 1. 模型转换时使用-align检查对齐情况 + +TNN模型转换工具支持对齐功能,可以在模型转换时检查生成的TNN模型与源模型是否对齐。强烈建议在模型转换时打开对齐检查,具体文档请参考[模型转换文档](https://github.com/Tencent/TNN/blob/master/doc/cn/user/convert.md)。 + +### 2. 使用model_check工具检查对齐情况 + +对于已经转换完成的模型,TNN提供了**model_check**工具辅助模型对齐情况的验证。**model_check**工具主要用于比较TNN不同设备(例如ARM,OpenCL,Metal等)的执行结果是否与TNN CPU等价,当怀疑TNN在某些设备上的执行结果不正确时,可以使用此工具进行检查。 + +**model_check**工具可以方便地在指定设备上,使用给出的输入数据或随机生成数据执行TNN模型,并与TNN CPU的执行结果进行逐算子的比较,从而准确定位存在问题的算子。**model_check**的使用方法请参考[model_check文档](https://github.com/Tencent/TNN/blob/master/doc/cn/development/model_check.md)。 + +## 二、常见的模型对齐问题 + +如果模型转换工具成功生成了TNN模型,但在使用中发现了不对齐的情况,可以按照以下方法排查问题。 + +由于神经网络模型类型众多,且不同框架对算子的定义和支持不尽相同,再加之各框架的算子支持情况还会随版本变化,所以存在转换前后算子功能不完全等价的情况。下表按照源模型的类型,总结了在实践中遇到的可能存在对齐问题的算子,可用于快速定位可能存在对齐问题的算子。 + +|源模型|问题算子列表| +|-|-| +|Pytorch |upsample, batchnorm, AvgPool| +|TensorFlow |TODO| +|tflite |ResizeBilinear| +|onnx |TODO| + +### 1.tensorflow +TODO + +### 2.pytorch + +#### upsample + +问题描述:将pytorch模型转换为onnx模型时,onnx的upsample算子与pytorch不等价 + +解决方法:1)更新pytorch;2)导出onnx模型时,设置opset_version>=11,代码如下: +``` +torch.onnx.export(model, input, filename, verbose=False, + opset_version=11,...) # or other number greater than 11 +``` + +#### batchnorm + +问题描述:将pytorch模型转换为onnx模型时,没有将pytorch切换到推理模式,导致batchnorm参数不固定 + +解决方法:导出onnx模型前,切换pytorch到推理模式,代码如下: +```torch_model.eval()``` or ```torch_model.train(False)``` + +#### AvgPool + +问题描述:pytorch模型中的AvgPool算子有count_include_pad属性,取值可以为```True```或```False```,当前TNN仅支持count_include_pad=```Fasle```的情况。 + +解决方法:导出onnx模型前,修改AvgPool算子的count_include_pad为```False``` + + +### 3.tflite + +#### ResizeBilinear + +问题描述:含有ResizeBilinear的tflite模型使用-align可能会不对齐,这是由于TensorFlow2.3之前tflite的ResizeBilinear实现存在问题导致的 + +解决方法:升级TensorFlow让其版本不小于2.3即可 + +### 4.onnx +TODO + +## 三、模型对齐问题的分析与处理方法 + +在排查模型对齐问题时,最直接有效的方法就是对比模型在相同输入下的计算结果。这一过程需要将TNN模型中特定算子的计算结果与原始模型中对应算子的计算结果进行比较。这可以通过保存算子的输入与输出实现。 + +TNN支持逐层dump结果的功能,可以通过下面的方法获得每层的输入和输出结果。 + +### 1. 
打开blob dump功能 + +打开[source/tnn/utils/blob_dump_utils.h](https://github.com/Tencent/TNN/blob/master/source/tnn/utils/blob_dump_utils.h)文件,根据需要修改`DUMP_INPUT_BLOB`和`DUMP_OUTPUT_BLOB`两个宏。其中`DUMP_INPUT_BLOB`为`1`表示保存TNN模型每个算子的输入;设置`DUMP_OUTPUT_BLOB`为`1`表示保存每个算子的输出。 + +数据保存过程的调用在[source/tnn/core/default_network.cc](https://github.com/Tencent/TNN/blob/master/source/tnn/core/default_network.cc)的`Forward`方法中。 + +具体来说,TNN将算子的每个输入和输出保存在独立的txt文本文件中,文件名由**算子在模型中的顺序、算子名称以及输入和输出自身的形状等因素共同**决定。例如,假设模型的第2层名为*foo*,其第1个输入被保存在前缀为*00001-foo-in-0*的文件中;其第2个输出被保存在前缀为*00001-foo-out-1*的文件中。每层的计算结果按照*N-C-H-W*的顺序保存在文件内,每行保存一个元素。TNN模型各算子的输入输出信息可借助[**Netron**可视化工具](https://netron.app/)查看。 + +文件的保存目录由[source/tnn/utils/blob_dump_utils.cc](https://github.com/Tencent/TNN/blob/master/source/tnn/utils/blob_dump_utils.cc)中的变量 `g_tnn_dump_directory`控制,可以根据需要进行修改。 + +### 2. 使用指定输入,获得每层计算结果 + +考虑到保存数据的过程位于`Forward`方法中,我们可以通过调用`Forward`方法实现数据保存。此外,也可以借助TNN已有的工具执行这一过程,例如**TNNTest**工具。由于**TNNTest**默认使用异步方法执行推理,不调用`Forward`方法,所以需要进行修改。 + +具体修改方法如下:打开[test/test.cc](https://github.com/Tencent/TNN/blob/master/test/test.cc)文件,找到其中的`ForwardAsync`方法,并将其替换为`Forward`方法。在不了解**TNNTest**具体工作流程的情况下,建议对代码中的**2处调用**均进行替换。替换过程如下所示: +将 +``` + ret = instance->ForwardAsync(nullptr); +``` +替换为 +``` +ret = instance->Forward(); +``` + +由于上述修改均位于源代码中,因此修改后需要重新编译TNN。TNN的编译可参考[TNN编译文档](https://github.com/Tencent/TNN/blob/master/doc/cn/user/compile.md)。 + +编译后可以用**TNNTest**工具执行模型,并保存每层的输入和输出结果。可参考[TNNTest文档](https://github.com/Tencent/TNN/blob/master/doc/cn/user/test.md)了解**TNNTest**的使用方法和参数。 + +### 3. 获得源模型算子的计算结果 + +保存源模型算子结果的方法与源模型基于的框架紧密相关。这里以onnx模型为例,说明逐层保存模型结果的方法。 +- onnx模型:使用`onnxruntime`执行onnx模型,并保存每个算子的计算结果。 +``` +def forward_dump(model_path:str, input_data:numpy.ndarray) -> Dict[str, numpy.ndarray]: + # 1. Load onnx model + model = onnx.load(model_path) + onnx.checker.check_model(model) + model = copy.deepcopy(model) + + # 2. Prepare input data + input_data = {'input_name': input_data} + + # 3. 
Set the output of each operator as the output of the model + for node in model.graph.node: + for output in node.output: + model.graph.output.extend([onnx.ValueInfoProto(name=output)]) + + # 4. Use onnxruntime to execute onnx models + sess = onnxruntime.InferenceSession(model.SerializeToString()) + outputs = [x.name for x in sess.get_outputs()] + result = OrderedDict(zip(outputs, sess.run(outputs, input_data))) + # 5. save the data in 'result' + + return result +``` +`result`为一个`Dict`,将onnx模型中每个算子的`name`映射到该算子的计算结果(`numpy.ndarray`)。 + +## 四、提交issue + +当遇到了TNN模型对齐的问题后,可以[提交issue](https://github.com/Tencent/TNN/issues)将问题反馈给我们,我们会尽快进行修复。 + +为了方便我们复现和定位问题,请按照issue模板填写issue相关信息,并在描述问题时请尽量提供以下内容: +1. 原模型与TNN模型; +2. 指定的输入数据和参考计算结果; +3. 对齐时使用的环境与方法:例如onnxruntime的版本、tflite版本、tnn版本等; +4. 其他辅助信息:例如,已经定位到的不对齐算子等 diff --git a/3rdparty/TNN/doc/cn/user/api.md b/3rdparty/TNN/doc/cn/user/api.md new file mode 100644 index 0000000..80df1ae --- /dev/null +++ b/3rdparty/TNN/doc/cn/user/api.md @@ -0,0 +1,558 @@ +# API说明 + +[English Version](../../en/user/api_en.md) + +## 一、API兼容性 + +TNN所有对外暴露接口均通过PUBLIC宏显示声明,非暴露接口符号均不可见。 + +```cpp +#if defined _WIN32 || defined __CYGWIN__ + #ifdef BUILDING_DLL + #ifdef __GNUC__ + #define PUBLIC __attribute__ ((dllexport)) + #else + #define PUBLIC __declspec(dllexport) + #endif + #else + #ifdef __GNUC__ + #define PUBLIC __attribute__ ((dllimport)) + #else + #define PUBLIC __declspec(dllimport) + #endif + #endif + #define LOCAL +#else + #if __GNUC__ >= 4 + #define PUBLIC __attribute__ ((visibility ("default"))) + #define LOCAL __attribute__ ((visibility ("hidden"))) + #else + #define PUBLIC + #define LOCAL + #endif +#endif +``` + +不同版本API 兼容性遵守[语义化版本 2.0.0](https://semver.org/lang/zh-CN/)规则。 + +## 二、API调用 + +### 简介 +API调用主要对模型解析,网络构建,输入设定,输出获取四个步骤进行简要介绍,详细说明参见API详解部分。 + +### 步骤1. 
模型解析 + +```cpp +TNN tnn; +TNN_NS::ModelConfig model_config; +//proto文件内容存入proto_buffer +model_config.params.push_back(proto_buffer); +//model文件内容存入model_buffer +model_config.params.push_back(model_buffer); +Status ret = tnn.Init(model_config); +``` + +TNN模型解析需配置ModelConfig params参数,传入proto和model文件内容,并调用TNN Init接口即可完成模型解析。 + +### 步骤2. 网络构建 + +```cpp +TNN_NS::NetworkConfig config; +config.device_type = TNN_NS::DEVICE_ARM; +TNN_NS::Status error; +auto net_instance = tnn.CreateInst(config, error); +``` + +TNN网络构建需配置NetworkConfig,device_type可配置`DEVICE_ARM`, `DEVICE_OPENCL`, `DEVICE_METAL`, `DEVICE_X86`, `DEVICE_CUDA`, `DEVICE_HUAWEI_NPU`, `DEVICE_RK_NPU`等多种加速方式,通过CreateInst接口完成网络的构建。 + + +### 步骤3. 输入设定 + +```cpp +auto status = instance->SetInputMat(input_mat, input_cvt_param); +``` + +TNN输入设定通过调用SetInputMat接口完成,需要传入的数据保存在`input_mat`中。 + +### 步骤4. 网络运行 + +```cpp +auto status = instante->Forward(); +``` +TNN Forward接口为同步调用接口,ForwardAsync接口为异步调用接口。 + +### 步骤5. 输出获取 + +```cpp +auto status = instance->GetOutputMat(output_mat); +``` + +TNN输出获取通过调用GetOutputMat接口完成,输出结果将按照特定格式保存在`output_mat`中。 + +## 二、API详解 + +### API目录结构 + +```bash +. +└── tnn + ├── core + │   ├── blob.h # 负责数据传递 + │   ├── common.h # 定义常用结构 + │   ├── instance.h # 网络实例 + │   ├── macro.h # 常用宏定义 + │   ├── mat.h # 输入接口,类cv::Mat + │   ├── status.h # 接口状态 + │   └── tnn.h # 模型解析 + ├── utils + │   ├── bfp16_utils.h # bfp16转换工具 + │   ├── blob_converter.h # blob输入输出转换工具 + │   ├── cpu_utils.h # CPU性能特定优化工具 + │   ├── data_type_utils.h # 数据类型转换工具 + │   ├── dims_vector_utils.h # 尺寸计算工具 + │   ├── half_utils.h # fp16转换工具 + │   ├── mat_utils.h # Mat转换工具 + │   └── string_utils.h # 字符串转换工具 + └── version.h # 编译构建信息 +``` + +### 1. 
core/common.h + +`DataType`:定义不同数据类型枚举值。 +`DataFormat`:定义Blob Data不同数据排布方式。 +`NetworkType`:定义不同网络构建类型,默认构建TNN网络,支持第三方库网络构建。 +`DeviceType`:用于指定网络运行设备及加速方式。 +`ModelType`:定义模型类型,TNN默认解析模型为TNN模型,同时支持其他第三方库模型格式传入。 +`Precision `: 定义网络运行精度。 + +```cpp +struct PUBLIC ModelConfig { + + ModelType model_type = MODEL_TYPE_TNN; + + // tnn model need two params: order is proto content, model content. + // ncnn need two: params: order is param content, bin content. + // openvino model need two params: order is xml content, model path. + // coreml model need one param: coreml model directory path. + // snpe model need one param: dlc model directory path. + // hiai model need two params: order is model name, model file path. + // atlas model need one param: config string. + std::vector params; +}; +``` + +ModelConfig参数说明: + +- `model_type`: TNN当前开源版本仅支持传入`MODEL_TYPE_TNN`, `MODEL_TYPE_NCNN`, `MODEL_TYPE_COREML` 模型格式。 +- `params`: TNN模型需传入proto文件内容以及model文件路径。NCNN模型需传入param文件内容以及bin文件路径, COREML模型需传入coreml 模型所在目录路径。 + + +```cpp +struct PUBLIC NetworkConfig { + // device type default cpu + DeviceType device_type = DEVICE_ARM; + + // device id default 0 + int device_id = 0; + + // blob data format, auto decided by device + DataFormat data_format = DATA_FORMAT_AUTO; + + // network type, auto decided by device + NetworkType network_type = NETWORK_TYPE_AUTO; + + // raidnet instances not share memory with others + ShareMemoryMode share_memory_mode = SHARE_MEMORY_MODE_DEFAULT; + + // dependent library path + std::vector library_path = {}; + + // compute precision + Precision precision = PRECISION_AUTO; + + // cache path to store possible cache models or opt kernel or opencl program cache + std::string cache_path = ""; + + // network init or reshape may cost more time to select opt kernel implement if enable tune kernel + // cache_path can set to store tune kernel info. 
+ bool enable_tune_kernel = false; +}; +``` + +NetworkConfig参数说明: + +- `device_type`: 默认为`DEVICE_ARM`。 当前已支持 `DEVICE_NAIVE`、`DEVICE_ARM`、`DEVICE_X86`、`DEVICE_OPENCL`、`DEVICE_METAL`、`DEVICE_CUDA`、`DEVICE_HUAWEI_NPU`、`DEVICE_RK_NPU`。 +- `device_id`: 默认为0,多个设备支持通过`device_id`选择,当前仅`DEVICE_CUDA`需配置此参数指定gpu id。 +- `data_format`: 默认为tnn自动选择blob数据排布方式进行加速,可通过此参数设定特定blob数据排布进行加速。 +- `network_type`: 默认根据`device_type`自动选择网络类型,可指定构建网络类型。 +- `share_memory_mode`: tnn instance 内存共享方式。 +- `library_path`: 支持外部依赖库加载,iOS metal kernel库放在app非默认路径需配置此参数。 +- `precision`: 网络精度类型,默认根据不同的`device_type`自动选择精度。 +- `cache_path`: 华为NPU指定cache路径可存放运行过程中转出的om文件,后续运行可直接通过加载cache路径对应om文件。OpenCL指定cache路径可缓存编译好的kernel二进制文件,后续初始化可直接通过二进制cache文件创建kernel, `enable_tune_kernel` 打开,可通过指定cache路径存放tune参数,后续可直接加载tune参数而无需每次运行都tune kernel。 + + +```cpp +typedef enum { + // default + SHARE_MEMORY_MODE_DEFAULT = 0, + // same thread tnn instance share blob memory + SHARE_MEMORY_MODE_SHARE_ONE_THREAD = 1, + // set blob memory from external, different thread share blob memory need + // synchronize + SHARE_MEMORY_MODE_SET_FROM_EXTERNAL = 2 +} ShareMemoryMode; +``` + +ShareMemoryMode参数说明: + +- `SHARED_MEMORY_MODE_DEFAULT`: 仅支持同一instance不同blob间内存共享。 +- `SHARE_MEMORY_MODE_SHARE_ONE_THREAD`: 支持同一线程的不同Instance内存共享。 +- `SHARE_MEMORY_MODE_SET_FROM_EXTERNAL`: 支持instance内存由外部传入,共享方式由调用侧决定,线程间共享需处理同步问题,内存分配释放均需调用侧维护。 + +### 2. core/tnn.h + +```cpp +class PUBLIC TNN { +public: + ... + + // init tnn implement, interpret model. + Status Init(ModelConfig& config); + + // denit tnn implement, release model interpreter. + Status DeInit(); + + // add output to the model. + // if output_name of blob not found, then search output_index of layer. + Status AddOutput(const std::string& output_name, int output_index = 0); + + // return input shapes map from model + Status GetModelInputShapesMap(InputShapesMap& shapes_map); + + // create tnn network instance with network config and inputs shape. 
+ // if inputs shape not set, use default from model. + std::shared_ptr CreateInst( + NetworkConfig& config, Status& status, + InputShapesMap inputs_shape = InputShapesMap()); + + // create tnn network instance with network config and min max inputs shape, + // instance reshape can support range from min inputs shape to max inputs shape. + std::shared_ptr CreateInst( + NetworkConfig& config, Status& status, + InputShapesMap min_inputs_shape, InputShapesMap max_inputs_shape); + + ... +}; +``` + +TNN接口说明: + +- Init接口:负责模型数据传入并解析,需配置并传入ModelConfig。 +- DeInit接口: 负责tnn implement释放,默认析构函数可自动释放。 +- AddOutput接口:支持增加模型输出,可将网络任意一层输出定义为模型输出。 +- GetModelInputShapesMap接口: 获取模型解析出的模型输入尺寸。 +- CreateInst接口:负责网络实例Instance构建,如果运行过程中支持输入维度可变,需配置`min_inputs_shape`和`max_inputs_shape`指定输入每个维度支持的最大最小尺寸。 + +### 3. core/instance.h + +```cpp +class PUBLIC Instance { +public: + Instance(NetworkConfig& net_config, ModelConfig& model_config); + + ~Instance(); + + // init with model interpeter and inputs shape. + Status Init(std::shared_ptr interpreter, InputShapesMap inputs_shape); + + // deinit, release network + Status DeInit(); + + // return memory bytes required for forward + Status GetForwardMemorySize(int& memory_size); + + // set memory to tnn instance. if success, return status code zero. + // only instance created with SHARE_MEMORY_MODE_SET_FROM_EXTERNAL can be set from external. + // the memory size need >= GetForwardMemorySize(). + // releasing or otherwise using the memory for other purposes during the tnn network run + // will result in undefined behavior. + Status SetForwardMemory(void* memory); + + // reshape instance with new input shapes + Status Reshape(const InputShapesMap& inputs); + + // get tnn command queue + Status GetCommandQueue(void** command_queue); + + // @brief tnn instance network infer, it will wait until all layer infer complete. + Status Forward(); + + ... + + // tnn instance network infer async. + // device gpu, all layer infer complete will call Callback. 
+ Status ForwardAsync(Callback call_back); + + // get all input blobs + Status GetAllInputBlobs(BlobMap& blobs); + + // get all output blobs + Status GetAllOutputBlobs(BlobMap& blobs); + + // set threads run on cpu + virtual Status SetCpuNumThreads(int num_threads); + ... + + // set input Mat, if input_name is not set, take the first input as default + Status SetInputMat(std::shared_ptr mat, + MatConvertParam param, + std::string input_name = ""); + + // get output Mat, if output_name is not set, take the first output as default + Status GetOutputMat(std::shared_ptr& mat, + MatConvertParam param = MatConvertParam(), + std::string output_name = "", + DeviceType device = DEVICE_ARM, MatType mat_type = NCHW_FLOAT); + +}; +``` + +Instance接口说明: + +- `Instance`和`Init`接口均由TNN CreateInst接口实现调用,用于生成Instance网络实例。 +- `GetForwardMemorySize`可获取Instance所有Blob所需内存大小,`SetForwardMemory`用于传入外部内存。对于`SHARE_MEMORY_MODE_SET_FROM_EXTERNAL`内存模式构建的Instance,内存需由外部传入, 传入内存实际大小不得小于`GetForwardMemorySize`返回值大小。 +- `Reshape`接口支持网络构建成功后重新设定输入尺寸,仅通过`min_inputs_shape`和`max_inputs_shape` 构建的网络可在运行过程中改变输入尺寸,可变尺寸范围由`min_inputs_shape`和`max_inputs_shape` 指定。 +- `GetCommandQueue`接口支持获取网络运行对应的command queue,同一command queue消息顺序执行。 +- `GetAllInputBlobs`和 `GetAllOutputBlobs`分别用于获取输入输出blob。 +- `SetCpuNumThreads`可设置CPU线程并行数。 +- `Forward`为网络运行同步接口,`ForwardAsync`为网络运行异步接口。 +- `SetInputMat`用于设定输入Mat,其中MatConvertParam可设定[转换参数](#MatConvertParam参数说明)。对于多输入网络,可用`input_name`区分。 +- `GetOutputMat`用于获取输出结果并保存在输出Mat中,其中MatConvertParam可设定[转换参数](#MatConvertParam参数说明)。对于多输出网络,可用`output_name`区分,DeviceType可指定输出Mat Memory构建在CPU还是GPU,MatType可用于设定输出Mat数据排列方式。 + + +### 4. core/mat.h + +```cpp +class PUBLIC Mat { +public: + ... 
+ + Mat(DeviceType device_type, MatType mat_type, DimsVector shape_dims, void* data); + Mat(DeviceType device_type, MatType mat_type, DimsVector shape_dims); + //empty mat + Mat(DeviceType device_type, MatType mat_type); + + DEPRECATED("use Mat(DeviceType, MatType, DimsVector, void*) instead") + Mat(DeviceType device_type, MatType mat_type, void* data) : Mat(device_type, mat_type, {1,0,0,0}, data) {}; + + ... +}; +``` +其中MatType支持常用的CV, NLP输入输出布局,且`DeviceType`可设定为CPU,GPU。 + +```cpp +typedef enum { + INVALID = -1, + //bgr or rgb: uint8 + N8UC3 = 0x00, + //bgra or rgba: uint8 + N8UC4 = 0x01, + //gray: uint8 + NGRAY = 0x10, + //YUV420SP, YYYYVUVUVU + NNV21 = 0x11, + //YUV420SP, YYYYUVUVUV + NNV12 = 0x12, + //nchw: float + NCHW_FLOAT = 0x20, + // nchw: int32 + NC_INT32 = 0x21, + ... +} PUBLIC MatType; +``` + +### 5. core/macro.h +提供不同平台Log宏,不同数据类型最大最小值宏,PUBLIC宏定义,以及部分数据pack转换等宏定义。 + +### 6. core/status.h +`Status`定义于status.h头文件中。 + +```cpp +enum StatusCode { + + TNN_OK = 0x0, + + // param errcode + TNNERR_PARAM_ERR = 0x1000, + TNNERR_INVALID_NETCFG = 0x1002, + ... +} + +class PUBLIC Status { +public: + Status(int code = TNN_OK, std::string message = "OK"); + + Status &operator=(int code); + + bool operator==(int code_); + bool operator!=(int code_); + operator int(); + operator bool(); + std::string description(); + +private: + int code_; + std::string message_; +} +``` +当Status code不为TNN_OK,通过`description`接口可返回错误描述信息。 + +### 7. core/blob.h + +```cpp +// @brief BlobDesc blob data info +struct PUBLIC BlobDesc { + // device_type describes devie cpu, gpu, ... + DeviceType device_type = DEVICE_NAIVE; + // data_type describes data precion fp32, in8, ... + DataType data_type = DATA_TYPE_FLOAT; + // data_format describes data order nchw, nhwc, ... 
+ DataFormat data_format = DATA_FORMAT_AUTO; + // DimsVector describes data dims + DimsVector dims; + // name describes the blob name + std::string name; + + std::string description(bool all_message = false); +}; + +struct PUBLIC BlobHandle { + void *base = NULL; + uint64_t bytes_offset = 0; +}; + +// @brief Blob tnn data store and transfer interface. +class PUBLIC Blob { +public: + ... + + //@brief create Blob with blob descript and data handle + Blob(BlobDesc desc, BlobHandle handle); + + ... +}; + +``` + +Blob当前主要由`BlobDesc`以及`BlobHandle`构成,其中`BlobDesc`描述Blob相关结构信息,`BlobHandle`用于读取和存储Blob数据。 + +`BlobDesc`用于描述`device_type`, `data_type`, `data_format`, `dims`, `name`信息。 + +dims描述blob维度信息,dims存储尺寸与data_format无关: +- dims尺寸为2,存储对应N, C。 +- dims尺寸为4,存储尺寸对应N,C,H,W。 +- dims尺寸为5,存储尺寸对应N,C,D,H,W。 + +当前不同平台blob输入输出数据类型及排布如下: + +- `ARM`:CPU内存, NC4HW4. +- `OPENCL`: GPU显存(clImage), NHC4W4. 其中NH为clImage高,C4W4为clImage宽。 +- `METAL`: GPU显存(metal), NC4HW4. +- `HUAWEI_NPU`: CPU内存, NCHW. +- `X86`: CPU内存,NCHW。 +- `CUDA`: GPU内存, NCHW。 + +其中最后4代表pack 4, C4代表最后1位4由4个C进行pack。 + + +### 8. utils/mat\_utils.h +```cpp +class PUBLIC MatUtils { +public: + //copy cpu <-> device, cpu<->cpu, device<->device, src and dst dims must be equal. + static Status Copy(Mat& src, Mat& dst, void* command_queue); + + //src and dst device type must be same. when param scale_w or scale_h is 0, it is computed as + // (double)dst.GetWidth() / src.GetWidth() or (double)dst.GetHeight() / src.GetHeight(). + static Status Resize(Mat& src, Mat& dst, ResizeParam param, void* command_queue); + + //src and dst device type must be same. when param width or height is 0, it is equal to + //dst.GetWidth() or dst.GetHeight(). + static Status Crop(Mat& src, Mat& dst, CropParam param, void* command_queue); + + //src and dst device type must be same. + static Status WarpAffine(Mat& src, Mat& dst, WarpAffineParam param, void* command_queue); + + //src and dst device type must be same. 
+ static Status CvtColor(Mat& src, Mat& dst, ColorConversionType type, void* command_queue); + + //src and dst device type must be same. param top, bottom, left and right must be non-negative. + static Status CopyMakeBorder(Mat& src, Mat& dst, CopyMakeBorderParam param, void* command_queue); +}; +``` + +接口参数说明: + +- `Copy`: 支持不同DEVICE与CPU Mat数据拷贝,以及相同DEVICE间Mat数据拷贝。 +- `Resize `、`Crop`、`WarpAffine `、`CvtColor `、`CopyMakeBorder` 接口行为类似OpenCV,CPU与GPU均支持,`src` 和 `dst` 需拥有相同的`DEVICE_TYPE`。 + + +### 9. utils/bfp16\_utils.h +接口提供了cpu内存fp32和bfp16转换工具。 + +### 10. utils/blob\_convert.h +```cpp +class PUBLIC BlobConverter { +public: + explicit BlobConverter(Blob* blob); + virtual Status ConvertToMat(Mat& image, MatConvertParam param, void* command_queue); + virtual Status ConvertFromMat(Mat& image, MatConvertParam param, void* command_queue); + + virtual Status ConvertToMatAsync(Mat& image, MatConvertParam param, void* command_queue); + virtual Status ConvertFromMatAsync(Mat& image, MatConvertParam param, void* command_queue); + +private: + Blob* blob_; + std::shared_ptr impl_ = nullptr; +}; +``` + +通过`ConvertToMat`可将blob数据按照Mat格式传入Mat,`ConvertFromMat`可将Mat数据按照blob格式传入blob, 接口对应的`command_queue`可通过 Instance `GetCommandQueue`接口获取。 + +接口提供常用预处理,后处理支持,支持设定scale, bias参数以及reverse channel适配bgr, rgb等场景。 + +```cpp +struct PUBLIC MatConvertParam { + std::vector scale = {1.0f, 1.0f, 1.0f, 1.0f}; + std::vector bias = {0.0f, 0.0f, 0.0f, 0.0f}; + bool reverse_channel = false; +}; +``` + +#### MatConvertParam参数说明: +- `reverse_channel`: 默认为`false`,若需要交换图像的B和R维度,可将此参数设置为`true`。 + * 仅`N8UC3`和`N8UC4`类型的Mat支持reverse_channel,其他类型的Mat会忽略该参数。 + * `ConvertFromMat`和`ConvertToMat`过程都支持reverse_channel。 +- `scale`和`bias`: scale默认为 `1`,bias默认为`0`,计算顺序为先乘scale,再加bias。 + * 所有类型的Mat都支持scale和bias。 + * `ConvertFromMat`和`ConvertToMat`过程都支持scale和bias。 + * 若指定的scale全为`1`,且bias全为`0`,或者使用默认的scale和bias值,则不做乘scale和加bias操作;否则用户需提供与channel大小对应的scale和bias值。 + * 
对于多维数据,scale和bias中的数值顺序和推理过程使用的数据格式保持一致。例如,若模型实际使用BGR格式进行推理,则`ConvertFromMat`和`ConvertToMat`过程,无论reverse_channel与否,scale和bias都需按照BGR顺序指定。也可理解为,`ConvertFromMat`先reverse channel,再乘scale和加bias;`ConvertToMat`先乘scale和加bias,再reverse channel。 + +### 11. utils/cpu\_utils.h +提供CPU线程核绑定以及省电模式等设定相关工具。 + +### 12. utils/data\_type\_utils.h +提供DataType尺寸和名称转换相关工具。 + +### 13. utils/dims\_vector\_utils.h +提供常用blob dims计算比较工具。 + +### 14. utils/half\_utils.h +接口提供cpu内存fp32和fp16转换工具。 + +### 15. utils/string\_utils.h +接口提供uchar string 到std::string的转换,主要用于TNN模型内存输入。 + +### 16. version.h +构建版本信息 diff --git a/3rdparty/TNN/doc/cn/user/caffe2tnn.md b/3rdparty/TNN/doc/cn/user/caffe2tnn.md new file mode 100644 index 0000000..ac92739 --- /dev/null +++ b/3rdparty/TNN/doc/cn/user/caffe2tnn.md @@ -0,0 +1,139 @@ +# Caffe 模型转换为 ONNX 模型 + +[English Version](../../en/user/caffe2tnn_en.md) + +要将 Caffe 模型转换为 TNN 模型,首先将 Caffe 模型转换为 ONNX 模型,然后再将ONNX 模型转换为 TNN 模型。 + +将 Caffe 模型转换为ONNX,我们借助于 caffe2onnx 工具, 它可以直接将 Caffe 模型转换为 ONNX 模型。在下面的文档中,会简单的介绍如何使用 caffe2onnx进行转换,然后建议参考 [onnx2tnn](onnx2tnn.md) 的相关文档,再将 ONNX 模型转换为 TNN。 + + +## 1. 环境搭建(Mac and Linux) + +- 安装protobuf(version >= 3.4.0) + +Macos: +```shell script +brew install protobuf +``` + +Linux: + +对于 linux 系统,我们建议参考protobuf 的官方[README](https://github.com/protocolbuffers/protobuf/blob/master/src/README.md)文档,直接从源码进行安装。 + +如果你使用的是Ubuntu 系统可以使用下面的指令进行安装: +```shell script +sudo apt-get install libprotobuf-dev protobuf-compiler +``` + +- 安装python (version >=3.6) + +Macos +```shell script +brew install python3 +``` +centos: +```shell script +yum install python3 python3-devel +``` + +- onnx(version == 1.6.0) +```shell script +pip3 install onnx==1.6.0 +``` + +- numpy(version >= 1.17.0) +```shell script +pip3 install numpy +``` + +## 2. caffe2onnx 工具使用 +- 进入工具目录 +``` shell script +cd /tools/caffe2onnx/ +``` +- caffe 格式转换 + +目前 caffe2onnx 的工具目前只支持最新版本的 caffe 的格式,所以在使用 caffe2onnx +工具之前需要将老版本的 caffe 网络和模型转换为新版. caffe 自带了工具可以把老版本的 +caffe 网络和模型转换为新版本的格式. 
具体的使用方式如下: +```shell script +upgrade_net_proto_text [老prototxt] [新prototxt] +upgrade_net_proto_binary [老caffemodel] [新caffemodel] +``` +修改后的输入的格式如下所示: + +```text +layer { + name: "data" + type: "input" + top: "data" + input_param { shape: { dim: 1 dim: 3 dim: 224 dim: 224 } } +} +``` +- caffe2onnx 工具的使用 + +```shell script +python3 convert2onnx.py ./test.prototxt ./test.caffemodel -o ./test.onnx -align -input_file=in.txt -ref_file=ref.txt +``` + +```text +usage: convert2onnx.py [-h] [-o ONNX_FILE] proto_file caffe_model_file + +convert caffe model to onnx + +positional arguments: + proto_file the path for prototxt file, the file name must end with + .prototxt + caffe_model_file the path for caffe model file, the file name must end with + .caffemodel! + +optional arguments: + -h, --help show this help message and exit + -o OUTPUT_DIR the output tnn directory + -v v1.0 the version for model, default v1.0 + -optimize If the model has fixed input shape, use this option to optimize the model for speed. On the other hand, if the model has dynamic input shape, dont use this option. It may cause warong result + -half save model using half + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for + the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference + data to compare the results. +``` +注意:当前仅支持单输入单输出模型和单输入多输出模型。 align 只支持 FP32 模型的校验,所以使用 align 的时候不能使用 half。 + +## 3. 
caffe2onnx 支持的算子 + +| Number | caffe layer | onnx operator | +| ------ | -------------------- | --------------------------------------------------- | +| 1 | BatchNorm | BatchNormalization | +| 2 | BatchNorm + Scale | BatchNormalization | +| 3 | Concat | Concat | +| 4 | Convolution | Conv | +| 5 | ConvolutionDepthwise | Conv | +| 6 | Crop | Slice | +| 7 | Deconvolution | ConvTranspose | +| 8 | DetectionOutput | DetectionOutput(customer defination) | +| 9 | Dropout | Dropout | +| 10 | Eltwise | Mul/Add/Max | +| 11 | Flatten | Reshape | +| 12 | InnerProduct | Reshape + Gemm | +| 13 | LRN | LRN | +| 14 | MaxUnPool | MaxUnPool | +| 15 | MVN | InstanceNorm | +| 16 | PReLU | PRelu | +| 17 | Permute | Transpose | +| 18 | Pooling | MaxPool/AveragePool/GlobalMaxPool/GlobalAveragePool | +| 19 | Power | Mul/Add/Pow | +| 20 | PriorBox | PriorBox(customer defination) | +| 21 | ReLU | Relu/LeakyRelu | +| 22 | ReLU6 | Clip | +| 23 | Reshape | Reshape | +| 24 | Scale | Mul + Reshape | +| 25 | ShuffleChannel | Reshape + Transpose + Reshape | +| 26 | Sigmoid | Sigmoid | +| 27 | Slice | Slice | +| 28 | Softmax | Softmax | +| 29 | Upsample | Resize | + diff --git a/3rdparty/TNN/doc/cn/user/compile.md b/3rdparty/TNN/doc/cn/user/compile.md new file mode 100644 index 0000000..831a8e8 --- /dev/null +++ b/3rdparty/TNN/doc/cn/user/compile.md @@ -0,0 +1,255 @@ +# 从源代码编译 + +[English Version](../../en/user/compile_en.md) + +## 一、iOS库编译 +### 1. 编译环境要求 + - Mac系统, Xcode IDE + - cmake(使用3.1及以上版本) + +### 2. 编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2)执行编译脚本 +``` +./build_ios.sh +``` +编译过程中如果出现xcrun、metal或metallib命令找不到,可尝试如下命令。 +``` +sudo xcode-select -s /Applications/Xcode.app/Contents/Developer/ +``` +编译完成后,在目录`platforms/ios`下产生`tnn.framework`库和`tnn.bundle`资源 +3)添加到工程 + + - 在iOS app工程的根目录中添加`tnn.framework`库和`tnn.bundle`资源; + - 在app Xcode工程的设置中找到`Build Setting -> Linking -> Other Linker Flags`选项; + - 添加`-force_load "$(path_to_tnn)/tnn.framework/tnn"`; + +### 3. 
限制说明 + +当前编译出的`tnn.framework`支持iOS设备上跑CPU和GPU,在Mac设备上当前仅支持跑GPU,CPU的支持在后续版本迭代中支持。 + +## 二、Android库编译 +### 1. 环境要求 +#### 依赖库 + - cmake(使用3.6及以上版本) + +#### NDK配置 + - 下载ndk版本(>=15c) + - 若要支持ARMv8.2编译,ndk版本版本至少为r18b + - 配置环境变量 `export ANDROID_NDK=` +### 2. 命令依赖 +centos: +```shell script +yum install attr.x86_64 +``` +ubuntu: +```shell script +sudo apt-get install attr +``` +### 3. 编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2)编辑`build_android.sh`修改配置选项 +``` + ABIA32="armeabi-v7a with NEON" + ABIA64="arm64-v8a" + STL="c++_static" + SHARED_LIB="ON" # ON表示编译动态库,OFF表示编译静态库 + ARM="ON" # ON表示编译带有Arm CPU版本的库 + OPENMP="ON" # ON表示打开OpenMP + OPENCL="ON" # ON表示编译带有Arm GPU版本的库 + HUAWEI_NPU="ON" # ON表示编译带有Arm GPU NPU版本的库 + SHARING_MEM_WITH_OPENGL=0 # 1表示OpenGL的Texture可以与OpenCL共享 +``` +华为NPU PS: +运行前需要下载DDK, 并放到指定文件夹。 或是用脚本直接下载具体请参考: +[FAQ](../faq.md)如何创建华为NPU编译环境? + +3)执行编译脚本 +``` +./build_android.sh +``` + +编译完成后,在当前目录的`release`目录下生成对应的`armeabi-v7a`库,`arm64-v8a`库和`include`头文件。如果是编译成静态库,集成链接需添加`-Wl,--whole-archive tnn -Wl,--no-whole-archive`。 + +## 三、ARM Linux跨平台交叉编译 + +### 1. 环境要求 +#### 依赖库 + - cmake(使用3.1及以上版本) + - 交叉编译需要安装编译工具链 + - ubuntu: aarch64: sudo apt-get install g++-aarch64-linux-gnu gcc-aarch64-linux-gnu + arm32hf: sudo apt-get install g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf + - other linux: 下载arm toolchain: https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads +### 2. 
编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2)编辑`build_aarch_linux.sh` 或 `build_armhf_linux.sh` 修改配置选项 +``` + SHARED_LIB="ON" # ON表示编译动态库,OFF表示编译静态库 + ARM="ON" # ON表示编译带有Arm CPU版本的库 + OPENMP="ON" # ON表示打开OpenMP + OPENCL="OFF" # ON表示编译带有Arm GPU版本的库 + RKNPU="OFF" # ON表示编译带有RKNPU版本的库 + #ARM64: + CC=aarch64-linux-gnu-gcc # 指定C编译器 + CXX=aarch64-linux-gnu-g++ # 指定C++编译器 + TARGET_ARCH=aarch64 # 指定指令架构 + #ARM32HF: + CC=arm-linux-gnueabihf-gcc + CXX=arm-linux-gnueabihf-g++ + TARGET_ARCH=arm +``` +3)执行编译脚本 +``` +./build_aarch_linux.sh +``` +RKNPU : 运行前需要下载DDK, 并放到指定文件夹。具体请参考: +[FAQ](../faq.md#如何创建rknpu编译环境)如何创建RKNPU编译环境? + +## 四、Linux 环境编译 +### 1.环境要求 +依赖库 + - cmake (使用3.11版本及以上) + - 网络访问 + +### 2.编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2) 执行编译脚本 + - 编译不带openvino的版本 +``` +./build_linux_native.sh +``` + - 编译带openvino的版本 +``` +./build_x86_linux.sh +``` +注意:openvino只能编译成64位的库,且cmake版本必须要求3.13以上 + +## 五、Linux CUDA库编译 +### 1.环境要求 +#### 依赖库 + - cmake (使用3.8及以上版本) + - CUDA (使用10.2及以上版本) + +#### TensorRT配置 + - 下载TensorRT(>=7.1) + - 配置环境变量 `export TENSORRT_ROOT_DIR=` + +#### CuDNN配置 + - 下载CuDNN(>=8.0) + - 配置环境变量 `export CUDNN_ROOT_DIR=` + +### 2.编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2) 执行编译脚本 +``` +./build_cuda_linux.sh +``` + +## 六、Windows 环境编译 +### 1.环境要求 +依赖库 + - Visual Studio (2017 及更高版本) + - cmake (把3.11及以上版本cmake加入环境变量或使用 Visual Studio 自带cmake) + - ninja (编译速度更快,可以使用choco安装) + +### 2.编译步骤 +打开 `x64 Native Tools Command Prompt for VS 2017/2019`,如果想要编译32位的库,打开 `x86 Native Tools Command Prompt for VS 2017/2019` +1) 切换到脚本目录 +``` +cd /scripts +``` +2) 执行编译脚本 + - 编译不带openvino的版本 +``` +.\build_msvc_naive.bat +``` + - 编译带openvino的版本 +``` +.\build_msvc.bat +``` +openvino只能编译成64位的库,更多编译问题请参考 [FAQ](openvino.md) + +## 七、Windows CUDA 环境编译 +### 1.环境要求 +依赖库 + - Visual Studio (2017 及更高版本) + - cmake (把3.11及以上版本cmake加入环境变量或使用 Visual Studio 自带cmake) + - CUDA (使用10.2及以上版本) 并且确保 `CUDA_PATH` 加入了环境变量 + +#### TensorRT配置 + - 下载TensorRT(>=7.1) + - 在脚本文件 *build_cuda_msvc.bat* 中修改 `set TENSORRT_ROOT_DIR=` + 
+#### CuDNN配置 + - 下载CuDNN(>=8.0) + - 在脚本文件 *build_cuda_msvc.bat* 中修改 `set CUDNN_ROOT_DIR=` + +### 2.编译步骤 +打开 `x64 Native Tools Command Prompt for VS 2017/2019` 或配置了cmake环境变量的 `cmd` +1) 切换到脚本目录 +``` +cd /scripts +``` +2) 执行编译脚本 +``` +.\build_cuda_msvc.bat +``` + +## 八、Macos 环境编译 +### 1.环境要求 +依赖库 + - cmake 3.11 以上版本 + - xcode command line tools (需提前在应用商店安装好Xcode,然后再命令行执行xcode-select --install ) + - automake, libtool (可通过brew安装,指令是brew install libtool, brew install automake) + - 网络访问 + +### 2.编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2)执行编译脚本 +``` +./build_macos.sh +``` + +## 编译参数option说明 + +|Option|默认值|说明| +|------|:---:|----| +|TNN_CPU_ENABLE| ON | 代码source/device/cpu编译开关,实现全部为c++代码,不包含特定CPU加速指令。| +|TNN_X86_ENABLE| OFF | 代码source/device/x86编译开关, 当前适配openvino实现,后续会迁入更多加速代码实现。| +|TNN_ARM_ENABLE| OFF | 代码source/device/arm编译开关,代码包含neon加速指令, 且部分实现了int8加速。| +|TNN_ARM82_ENABLE| OFF | 代码source/device/arm/acc/compute_arm82编译开关,代码包含fp16指令加速。| +|TNN_METAL_ENABLE| OFF | 代码source/device/metal编译开关,代码包含metal加速指令。| +|TNN_OPENCL_ENABLE| OFF | 代码source/device/opencl编译开关,代码包含opencl加速指令。| +|TNN_CUDA_ENABLE| OFF | 代码source/device/cuda编译开关,当前适配TensorRT实现,后续会迁入更多加速代码实现。| +|TNN_DSP_ENABLE| OFF | 代码source/device/dsp编译开关,当前适配snpe实现。| +|TNN_ATLAS_ENABLE| OFF | 代码source/device/atlas编译开关,当前适配华为atlas加速框架。| +|TNN_HUAWEI_NPU_ENABLE| OFF | 代码source/device/huawei_npu编译开关,当前适配HiAI加速框架。| +|TNN_RK_NPU_ENABLE| OFF | 代码source/device/rknpu编译开关,当前适配rknpu_ddk加速框架。| +|TNN_SYMBOL_HIDE| ON | 加速库符号隐藏,release发布默认非public接口符号不可见。| +|TNN_OPENMP_ENABLE| OFF | OpenMP开关,控制是否打开openmp加速。| +|TNN_BUILD_SHARED| ON | 动态库编译开关,关闭则编译静态库。| +|TNN_TEST_ENABLE| OFF | test代码编译开关| +|TNN_UNIT_TEST_ENABLE| OFF | unit test编译开关,打开unit test编译开关会自动打开TNN_CPU_ENABLE开关,作为测试基准。| +|TNN_PROFILER_ENABLE| OFF | 性能调试开关,打开后会打印更多性能信息,仅用于调试。| +|TNN_QUANTIZATION_ENABLE| OFF | 量化工具编译开关| +|TNN_BENCHMARK_MODE| OFF | benchmark开关,打开后支持model weights文件为空,可自动生成数据。| +|TNN_ARM82_SIMU| OFF | ARM82仿真开关,需要和TNN_ARM82_ENABLE同时打开,打开后可以在普通CPU上运行half实现代码。| + diff --git 
a/3rdparty/TNN/doc/cn/user/convert.md b/3rdparty/TNN/doc/cn/user/convert.md new file mode 100755 index 0000000..dfe2fea --- /dev/null +++ b/3rdparty/TNN/doc/cn/user/convert.md @@ -0,0 +1,648 @@ +# 模型转换介绍 + +[English Version](../../en/user/convert_en.md) + +
目前 TNN 支持业界主流的模型文件格式,包括ONNX、PyTorch、TensorFlow、TensorFlow-Lite 以及 Caffe 等。
+docker build -t tnn-convert:latest . +``` +docker 会根据 Dockerfile 文件进行构建,这需要等待一会。等构建完成之后,你可以通过下面的命令进行验证是否构建完成。 +``` shell script +docker images +``` +在输出的列表中会有下面类似的输出,这表明docker 的镜像已经构建好了。 +``` text +REPOSITORY TAG IMAGE ID CREATED SIZE +tnn-convert latest 9fb83110d2c9 26 minutes ago 2.79GB +``` + + + +### convert2tnn 工具进行转换 + +首先验证下 docker 镜像能够正常使用,首先我们通过下面的命令来看下 convert2tnn 的帮助信息: + +``` shell script +docker run -it tnn-convert:latest python3 ./converter.py -h +``` +如果docker 镜像是正确的话,你会得到下面的输出: +```text + +usage: convert [-h] {onnx2tnn,caffe2tnn,tf2tnn} ... + +convert ONNX/Tensorflow/Caffe model to TNN model + +positional arguments: + {onnx2tnn,caffe2tnn,tf2tnn} + onnx2tnn convert onnx model to tnn model + caffe2tnn convert caffe model to tnn model + tf2tnn convert tensorflow model to tnn model + tflite2tnn convert tensorflow-lite model to tnn model + +optional arguments: + -h, --help show this help message and exit +``` +从上面的帮助信息中,我们可以得知,目前 convert2tnn 提供了 3 种模型格式的转换支持。假设我们这里想将 TensorFlow 模型转换成 TNN 模型,我们输入下面的命令继续获得帮助信息: + +``` shell script +docker run -it tnn-convert:latest python3 ./converter.py tf2tnn -h +``` +得到的输出信息如下: +``` text +usage: convert tf2tnn [-h] -tp TF_PATH -in input_info [input_info ...] -on output_name [output_name ...] [-o OUTPUT_DIR] [-v v1.0] [-optimize] [-half] [-align] [-input_file INPUT_FILE_PATH] + [-ref_file REFER_FILE_PATH] + +optional arguments: + -h, --help show this help message and exit + -tp TF_PATH the path for tensorflow graphdef file + -in input_info [input_info ...] + specify the input name and shape of the model. e.g., -in input1_name:1,128,128,3 input2_name:1,256,256,3 + -on output_name [output_name ...] + the tensorflow model's output name. e.g. -on output_name1 output_name2 + -o OUTPUT_DIR the output tnn directory + -v v1.0 the version for model + -optimize If the model has fixed input shape, use this option to optimize the model for speed. On the other hand, if the model has dynamic input shape, dont use this option. 
It may cause warong result + -half save the model using half + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference data to compare the results. +``` +通过上面的输出,可以发现针对 TF 模型的转换,convert2tnn 工具提供了很多参数,我们一次对下面的参数进行解释: + +- tp 参数(必须) + 通过 “-tp” 参数指定需要转换的模型的路径。目前只支持单个 TF模型的转换,不支持多个 TF 模型的一起转换。 +- in 参数(必须) + 通过 “-in” 参数指定模型输入,例如:-in input_name_1:1,128,128,3 input_name_2:1,256,256,3。 +- on 参数(必须) + 通过 “-on” 参数指定模型输出的名称,例如: -on output_name1 output_name2 +- output_dir 参数: + 可以通过 “-o ” 参数指定输出路径,但是在 docker 中我们一般不使用这个参数,默认会将生成的 TNN 模型放在当前和 TF 模型相同的路径下。 +- optimize 参数(可选) + 可以通过 “-optimize” 参数来对模型进行优化,**对于固定输入维度的模型,我们强烈建议你开启这个选项,对于动态可变输入维度的模型则关闭这个选项,否则可能在维度变化时造成结果错误或者运行报错**。 +- v 参数(可选) + 可以通过 -v 来指定模型的版本号,以便于后期对模型进行追踪和区分。 +- half 参数(可选) + 可以通过 -half 参数指定,模型数据通过 FP16 进行存储,减少模型的大小,默认是通过 FP32 的方式进行存储模型数据的。 +- align 参数(可选) + 可以通过 -align 参数指定转换得到的 TNN 模型和原模型对齐的模式,确定 TNN 模型是否转换成功。例如:不使用 “-align” 参数,默认不进行对齐;如果只对比 TNN 模型和原模型最后一层的输出,可以使用命令 “-align” 或 “-align output”; 如果模型不对齐,可以使用命令 “-align all” 进行逐层对齐,并输出第一层不对齐层的信息。(TensorFlow Lite 模型暂时不支持 “-align all”)。__align 只支持 FP32 模型的校验,所以使用 align 的时候不能使用 half__ +- input_file 参数(可选) + 可以通过 -input_file 参数指定模型对齐所需要的输入文件的名称,输入需要遵循如下[格式](#输入)。生成输入的代码可以[参考](#生成输入或输出文件示例代码)。 +- ref_file 参数(可选) + 可以通过 -ref_file 参数指定待对齐的输出文件的名称,输出需遵循如下[格式](#输出)。生成输出的代码可以[参考](#生成输入或输出文件示例代码)。 + + +**当前 convert2tnn 的模型只支持 graphdef 模型,不支持 checkpoint 以及 saved_model 格式的文件,如果想将 checkpoint 或者 saved_model 的模型进行转换,可以参看下面[tf2tnn](./tf2tnn.md)的部分,自行进行转换。** + +下面我们通过一个例子来展示如何将 TF 模型转换到 TNN 模型, + +``` shell script +docker run --volume=$(pwd):/workspace -it tnn-convert:latest python3 ./converter.py tf2tnn \ + -tp /workspace/test.pb \ + -in "input0:1,32,32,3 input2:1,32,32,3" \ + -on output0 output1 \ + -v v2.0 \ + -optimize \ + -align \ + -input_file /workspace/in.txt \ + -ref_file /workspace/ref.txt +``` + 
+由于 convert2tnn工具是部署在 docker 镜像中的,如果要进行模型的转换,需要先将模型传输到 docker 容器中。我们可以通过 docker run 的参数--volume 将包含模型的模型挂载到 docker 容器的某个路径下。上面的例子中是将执行shell 的当前目录(pwd)挂载到 docker 容器中的 "/workspace” 文件夹下面。当然了测试用到的test.pb 也**必须执行 shell 命令的当前路径下**。执行完成上面的命令后,convert2tnn 工具会将生成的 TNN 模型存放在 test.pb文件的同一级目录下,当然了生成的文件也就是在当前目录下。 + +上面的文档中只是介绍了 TensorFlow 的模型的转换,其他模型的使用也是类似的,可以自行通过转换工具的帮助信息的提醒进行使用,我这里不在对这些转换命令进行详细的说明,只是简单的将这些转换命令列出来,你可以仿照着进行转换。 + +``` shell script +# convert onnx +docker run --volume=$(pwd):/workspace -it tnn-convert:latest python3 ./converter.py onnx2tnn \ + /workspace/mobilenetv3-small-c7eb32fe.onnx \ + -optimize \ + -v v3.0 \ + -align output \ + -input_file /workspace/in.txt \ + -ref_file /workspace/ref.txt + +# convert caffe +docker run --volume=$(pwd):/workspace -it tnn-convert:latest python3 ./converter.py caffe2tnn \ + /workspace/squeezenet.prototxt \ + /workspace/squeezenet.caffemodel \ + -optimize \ + -v v1.0 \ + -align \ + -input_file /workspace/in.txt \ + -ref_file /workspace/ref.txt + +# convert tflite +docker run --volume=$(pwd):/workspace -it tnn-convert:latest python3 ./converter.py tflite2tnn \ + /workspace/mobilenet_v1_1.0_224.tflite \ + -v v1.0 \ + -align \ + -input_file /workspace/in.txt \ + -ref_file /workspace/ref.txt + + +``` + +## Convert2tnn 手动安装 +如果你不想使用 docker 镜像的方式,也可以在自己的开发机上安装 convert2tnn 的依赖工具,并根据相关的说明进行编译,也可以同样使用 convert2tnn 工具机型模型转换。 + +convert2tnn 的完整环境搭建包含下面的所有的工具的安装和编译。如果你只想转换某一类的模型,你只需要安装转换对应模型转换的依赖工具。例如你只想转换 caffe 的模型,你就不需要安装 转换 TensorFlow 模型依赖的工具。同理你需要转换 TensorFlow 的模型,就可以不用安装 Caffe 模型转换的依赖工具。但是 ONNX 模型依赖工具和安装和编译都是必须的。 + +针对 Linux 系统下的环境配置,我使用 Centos 7.2 为例,Ubuntu 系统也可以适用,只要将相应的安装命令修改为 Ubuntu 上的对应命令即可。 + +### 环境搭建及编译 +#### 1. 
ONNX模型转换工具搭建(必须) +- 安装protobuf(version >= 3.4.0) +Macos: +```shell script +brew install protobuf +``` + +- 安装python (version >=3.6) +Macos +```shell script +brew install python3 +``` +centos: +```shell script +yum install python3 python3-devel +``` +- 安装 python 依赖库 +onnx=1.6.0 +onnxruntime>=1.1.0 +numpy>=1.17.0 +onnx-simplifier>=0.2.4 +protobuf>=3.4.0 +requests +```shell script +pip3 install onnx==1.6.0 onnxruntime numpy onnx-simplifier protobuf requests +``` + +- cmake (version >= 3.0) +从的官网下载最新版本的 cmake,然后按照文档安装即可。建议使用最新版本的 cmake。 + +##### 编译 +onnx2tnn 工具在 Mac 以及 Linux 上有自动编译脚本直接运行就可以。 + ```shell script +cd /tools/convert2tnn +./build.sh + ``` + +#### 2. TensorFlow 模型转换(可选) + + +- tensorflow (version == 1.15.0) +建议使用 TensorFlow 1.15.0 的版本,目前 TensorFlow 2.+ 的版本的兼容性不好, 不建议使用。 +```shell script +pip3 install tensorflow==1.15.0 +``` + +- tf2onnx (version>= 1.5.5) +```shell script +pip3 install tf2onnx +``` +- onnxruntime(version>=1.1.0) +```shell script +pip3 install onnxruntime +``` + +#### 3. Caffe 模型转换(可选) + +- 安装protobuf(version >= 3.4.0) + +Macos: +```shell script +brew install protobuf +``` + +Linux: + +对于 linux 系统,我们建议参考 protobuf 的官方[README](https://github.com/protocolbuffers/protobuf/blob/master/src/README.md)文档,直接从源码进行安装。 + +如果你使用的是 Ubuntu 系统可以使用下面的指令进行安装: +```shell script +sudo apt-get install libprotobuf-dev protobuf-compiler +``` + +- 安装python (version >=3.6) + +Macos +```shell script +brew install python3 +``` +centos: +```shell script +yum install python3 python3-devel +``` + +- onnx(version == 1.6.0) +```shell script +pip3 install onnx==1.6.0 +``` + +- numpy(version >= 1.17.0) +```shell script +pip3 install numpy +``` + +#### convert2tnn 工具的使用 +配置后上面的环境依赖之后,就可以使用 convert2tnn 进行相应模型的转换 + +```shell script +cd /tools/convert2tnn/ +python3 converter.py -h +``` +执行上面的命令会打印下面的信息。目前 convert2tnn 提供了三个子命令,分别对相应的模型进行转换。 + +```text +usage: convert [-h] {onnx2tnn,caffe2tnn,tf2tnn} ... 
+ +convert ONNX/Tensorflow/Caffe model to TNN model + +positional arguments: + {onnx2tnn,caffe2tnn,tf2tnn} + onnx2tnn convert onnx model to tnn model + caffe2tnn convert caffe model to tnn model + tf2tnn convert tensorflow model to tnn model + +optional arguments: + -h, --help show this help message and exit +``` +- ONNX模型转换 +如果想相对 ONNX 模型进行转换,可以直接使用 onnx2tnn 的子命令来查看帮助信息。 + +```shell script +python3 converter.py onnx2tnn -h +``` +usage 信息如下: +```text +usage: convert onnx2tnn [-h] [-in input_info [input_info ...]] [-optimize] + [-half] [-v v1.0.0] [-o OUTPUT_DIR] [-align] + [-input_file INPUT_FILE_PATH] + [-ref_file REFER_FILE_PATH] [-debug] + onnx_path + +positional arguments: + onnx_path the path for onnx file + +optional arguments: + -h, --help show this help message and exit + -in input_info [input_info ...] + specify the input name and shape of the model. e.g., + -in input1_name:1,3,128,128 input2_name:1,3,256,256 + -optimize If the model has fixed input shape, use this option to optimize the model for speed. On the other hand, if the model has dynamic input shape, dont use this option. It may cause warong result + -half save model using half + -v v1.0.0 the version for model + -o OUTPUT_DIR the output tnn directory + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for + the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference + data to compare the results. + -debug Turn on the switch to debug the model. +``` +示例: +```shell script +python3 converter.py onnx2tnn \ + ~/mobilenetv3/mobilenetv3-small-c7eb32fe.onnx.onnx \ + -optimize \ + -v=v3.0 \ + -o ~/mobilenetv3/ \ + -align \ + -input_file in.txt \ + -ref_file ref.txt +``` + +- caffe2tnn + +Caffe 格式转换 + +目前 convert2tnn 的工具目前只支持最新版本的 Caffe 的文件格式,所以如果想将 Caffe 模型转换为 TNN 模型。需要先将老版本的 Caffe 网络和模型转换为新版. Caffe 自带了工具可以把老版本的 + +Caffe 网络和模型转换为新版本的格式. 
具体的使用方式如下: +```shell script +upgrade_net_proto_text [老prototxt] [新prototxt] +upgrade_net_proto_binary [老caffemodel] [新caffemodel] +``` +修改后的输入的格式如下所示: + +```text +layer { + name: "data" + type: "input" + top: "data" + input_param { shape: { dim: 1 dim: 3 dim: 224 dim: 224 } } +} +``` + + +```shell script +python3 converter.py caffe2tnn -h +``` +usage 信息如下: +```text +usage: convert caffe2tnn [-h] [-o OUTPUT_DIR] [-v v1.0] [-optimize] [-half] + prototxt_file_path caffemodel_file_path + +positional arguments: + prototxt_file_path the path for prototxt file + caffemodel_file_path the path for caffemodel file + +optional arguments: + -h, --help show this help message and exit + -o OUTPUT_DIR the output tnn directory + -v v1.0 the version for model, default v1.0 + -optimize If the model has fixed input shape, use this option to optimize the model for speed. On the other hand, if the model has dynamic input shape, dont use this option. It may cause warong result + -half save model using half + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for + the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference + data to compare the results. +``` +示例: +```shell script +python3 converter.py caffe2tnn \ + ~/squeezenet/squeezenet.prototxt \ + ~/squeezenet/squeezenet.caffemodel \ + -optimize \ + -v v1.0 \ + -o ~/squeezenet/ \ + -align \ + -input_file in.txt \ + -ref_file ref.txt +``` +- tensorflow2tnn + +当前 convert2tnn 的模型只支持 graphdef 模型,不支持 checkpoint 以及 saved_model 格式的文件,如果想将 checkpoint 或者 saved_model 的模型进行转换,可以参看下面的 tf2onnx 的部分,自行进行转换。 + +``` shell script +python3 converter.py tf2tnn -h +``` +usage 信息如下: +```text +usage: convert tf2tnn [-h] -tp TF_PATH -in input_info [input_info ...] -on output_name [output_name ...] 
[-o OUTPUT_DIR] [-v v1.0] [-optimize] [-half] [-align] [-input_file INPUT_FILE_PATH] + [-ref_file REFER_FILE_PATH] + +optional arguments: + -h, --help show this help message and exit + -tp TF_PATH the path for tensorflow graphdef file + -in input_info [input_info ...] + specify the input name and shape of the model. e.g., -in input1_name:1,128,128,3 input2_name:1,256,256,3 + -on output_name [output_name ...] + the tensorflow model's output name. e.g. -on output_name1 output_name2 + -o OUTPUT_DIR the output tnn directory + -v v1.0 the version for model + -optimize If the model has fixed input shape, use this option to optimize the model for speed. On the other hand, if the model has dynamic input shape, dont use this option. It may cause warong result + -half save the mode using half + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference data to compare the results. +``` +- tensorflow-lite2tnn + +当前 tensorflow-lite2tnn 的转换支持tflite格式文件,从而方便移动端部署。 + +``` shell script +python3 converter.py tflite2tnn -h +``` +usage 信息如下: +``` +usage: convert tflite2tnn [-h] TF_PATH [-o OUTPUT_DIR] [-v v1.0] [-align] + +optional arguments: + -h, --help show this help message and exit + TF_PATH the path for tensorflow-lite graphdef file + -o OUTPUT_DIR the output tnn directory + -v v1.0 the version for model + -align align the onnx model with tnn model + -input_file INPUT_FILE_PATH + the input file path which contains the input data for + the inference model. + -ref_file REFER_FILE_PATH + the reference file path which contains the reference + data to compare the results. 
+``` +示例: +```shell script +python3 converter.py tflite2tnn \ + ~/tf-model/test.tflite \ + -o ~/tf-model/ \ + -align \ + -input_file in.txt \ + -ref_file ref.txt +``` + +## 输入输出文件格式示例 +### 输入 +```text + +输入数量 +输入名称 shape维度个数 具体shape信息 输入数据类型 +输入数据 +输入名称 shape维度个数 具体shape信息 输入数据类型 +输入数据 +...... + +例如 + 2 + in0 4 1 3 1 1 3 + 2 + 4 + 3 + in1 4 1 2 2 1 0 + 0.1 + 0.2 + 0.3 + 0.4 + + +提示: +如果输入数据是 float, 输入数据类型可以用 0 表示 +如果输入数据是 int , 输入数据类型可以用 3 表示 + +``` + +### 输出 +```text + +输出数量 +输出名称 shape维度个数 具体shape信息 输出数据类型 +输出数据 +输出名称 shape维度个数 具体shape信息 输出数据类型 +输出数据 +...... + +例如 + 2 + out0 2 1 3 0 + 0.1 + 0.2 + 0.3 + out1 4 1 2 2 1 0 + 0.1 + 0.2 + 0.3 + 0.4 + + +提示: +如果输出数据是 float, 输出数据类型可以用 0 表示 +如果输出数据是 int , 输出数据类型可以用 3 表示 + +``` + +### 生成输入或输出文件示例代码 +```python +def write_pytorch_data(output_path, data, data_name_list): + """ + Save the data of Pytorch needed to align TNN model. + + The input and output names of pytorch model and onnx model may not match, + you can use Netron to visualize the onnx model to determine the data_name_list. + + The following example converts ResNet50 to onnx model and saves input and output: + >>> from torchvision.models.resnet import resnet50 + >>> model = resnet50(pretrained=False).eval() + >>> input_data = torch.randn(1, 3, 224, 224) + >>> input_names, output_names = ["input"], ["output"] + >>> torch.onnx.export(model, input_data, "ResNet50.onnx", input_names=input_names, output_names=output_names) + >>> with torch.no_grad(): + ... output_data = model(input_data) + ... + >>> write_pytorch_data("input.txt", input_data, input_names) + >>> write_pytorch_data("output.txt", output_data, output_names) + + :param output_path: Path to save data. + :param data: The input or output data of Pytorch model. + :param data_name_list: The name of input or output data. You can get it after visualization through Netron. 
+ :return: + """ + + if type(data) is not list and type(data) is not tuple: + data = [data, ] + assert len(data) == len(data_name_list), "The number of data and data_name_list are not equal!" + with open(output_path, "w") as f: + f.write("{}\n" .format(len(data))) + for name, data in zip(data_name_list, data): + data = data.numpy() + shape = data.shape + description = "{} {} ".format(name, len(shape)) + for dim in shape: + description += "{} ".format(dim) + data_type = 0 if data.dtype == np.float32 else 3 + fmt = "%0.6f" if data_type == 0 else "%i" + description += "{}".format(data_type) + f.write(description + "\n") + np.savetxt(f, data.reshape(-1), fmt=fmt) + + +def write_tensorflow_data(output_path, data, data_name_list, data_usage=1): + """ + Save the data of TensorFlow needed to align TNN model. + + :param output_path: Path to save data. "You should use input.txt or output.txt to name input or output data" + :param data: The input or output data of TensorFlow model. + :param data_name_list: The name of input or output data. You can get it after visualization through Netron. + :param data_usage: Specify the data usage. If the data is input data, data_usage=0; + if the data is output data, data_usage=1. + :return: + """ + def convert_nhwc(data): + assert len(data.shape) <= 4 + if len(data.shape) == 2: + return data + orders = (0, 2, 1) if len(data.shape) == 3 else (0, 2, 3, 1) + return data.transpose(orders) + + if type(data) is not list and type(data) is not tuple: + data = [data, ] + assert len(data) == len(data_name_list), "The number of data and data_name_list are not equal!" 
+ with open(output_path, "w") as f: + f.write("{}\n" .format(len(data))) + for name, data in zip(data_name_list, data): + data = convert_nhwc(data) if data_usage == 0 else data + shape = data.shape + description = "{} {} ".format(name, len(shape)) + for dim in shape: + description += "{} ".format(dim) + data_type = 0 if data.dtype == np.float32 else 3 + fmt = "%0.6f" if data_type == 0 else "%i" + description += "{}".format(data_type) + f.write(description + "\n") + np.savetxt(f, data.reshape(-1), fmt=fmt) + + +``` + + +## 模型转换详细介绍 +convert2tnn 只是对多种模型转换的工具的封装,根据第一部分 “模型转换介绍”中原理说明,你也可以先将原始模型转换成 ONNX,然后再将 ONNX 模型转换成 TNN 模型。我们提供了如何手动的将 Caffe、PyTorch、TensorFlow 模型转换成 ONNX 模型,然后再将 ONNX 模型转换成 TNN 模型的文档。如果你在使用 convert2tnn 转换工具遇到问题时,我们建议你了解下相关的内容,这有可能帮助你更加顺利的进行模型转换。 + +- [onnx2tnn](onnx2tnn.md) +- [pytorch2tnn](onnx2tnn.md) +- [tf2tnn](tf2tnn.md) +- [caffe2tnn](caffe2tnn.md) +- [tflite2tnn](tflite2tnn.md) + diff --git a/3rdparty/TNN/doc/cn/user/demo.md b/3rdparty/TNN/doc/cn/user/demo.md new file mode 100644 index 0000000..1613dcc --- /dev/null +++ b/3rdparty/TNN/doc/cn/user/demo.md @@ -0,0 +1,591 @@ +# Demo 代码介绍 + +[English Version](../../en/user/demo_en.md) + +## 一、iOS Demo 介绍 + +### Demo运行步骤 + +1. 下载Demo模型 + + ``` + cd /model + sh download_model.sh + ``` + + 可选:如果需要执行OCR demo,还需要准备opencv库。可以使用提供的脚本下载opencv。 + ``` + cd /scripts + sh download_opencv.sh iOS + ``` + + PS: 如因网络问题脚本无法下载模型或依赖库,请根据脚本中的信息手动创建对应文件夹并自行下载 + +2. 打开TNNExamples工程 + + 进入目录`/examples/ios/`,双击打开TNNExamples工程。 + + 可选:如果需要执行OCR demo,需要将opencv加入TNNExamples的依赖项中。 + + 如下图点击TNNExamples工程,找到工程设置`General`,在`Framworks, Libraries, and Embedded Content`选项卡下点击`+`。 + +
+ + 在打开的界面中选择`Add Other-Add Files...`,找到opencv2.framework,并添加。使用提供的`/scripts/download_opencv.sh`时,下载的opencv位于`/third_party/opencv/iOS`目录下。 + +
+ + 由于opencv2.framework中包含真机和模拟器多平台的代码,需要按下图将`Embed`选项设置为`Do Not Embed`。 + +
+ + 最后,为了确保编译器可以找到opencv.framework,需要确认opencv.framework所在目录被添加到`Framework Search Paths`中。如下图所示,找到工程设置`Build Settings`,在`Search Paths`选项卡下找到`Framework Search Paths`。如果opencv.framework所在的目录不存在,需要双击这一条目,并添加。 + +
+ + +3. 设置开发者账号 + + 如下图点击TNNExamples工程,找到工程设置`Signing & Capabilities`,点击Team选项卡选择`Add an Account...` + +
+ + 在如下界面输入Apple ID账号和密码,添加完成后回到`Signing & Capabilities`界面,并在Team选项卡中选中添加的账号。如果没有Apple ID也可以通过`Create Apple ID`选项根据相关提示进行申请。 + + `PS:申请Apple ID无需付费,可以即时通过,通过后才可在真机上运行APP调试` + +
+ +4. 真机运行 + + 4.1 修改`Bundle Identitifier` + + 如图在现有`Bundle Identifier`后随机添加后缀(限数字和字母),避免个人账户遇到签名冲突。 + +
+ +4.2 验证授权 + +首次运行先利用快捷键`Command + Shift + K`对工程进行清理,再执行快捷键`Command + R`运行。如果是首次登陆Apple ID,Xcode会弹框报如下错误,需要在iOS设备上根据提示进行授权验证。一般来说手机上的授权路径为:设置 -> 通用 -> 描述文件与设备管理 -> Apple Development选项 -> 点击信任 + +
+ +4.3 运行结果 + +首次运行先利用快捷键`Command + Shift + K`对工程进行清理,再执行快捷键`Command + R`运行。默认界面为人脸检测,可以点击右上角编辑按钮切换图像分类等不同功能。 + +PS: + +a) 由于GPU和CPU加速原理不同,具体模型的GPU性能不一定比CPU高,与具体机型、模型结构以及工程实现有关。欢迎大家参与到TNN开发中,共同进步。 + +b) tnn_sdk_sample.h中的宏TNN_SDK_USE_NCNN_MODEL默认为0,运行TNN模型,可以设置为1来运行ncnn模型。 + + c) 如遇到`Unable to install...`错误提示,请在真机设备上删除已有的TNNExamples,重新运行安装。 + + d) 真机运行时,如果遇到CodeSign错误`Command CodeSign failed with a nonzero exit code`,可参看issue20 `iOS Demo运行步骤说明` + +c) 如果需要执行OCR demo,需要将tnn_sdk_sample.h中的宏HAS_OPENCV设置为1,否则不会编译OCR demo代码。 + +### Demo运行效果 + +1. 人脸检测 + + 模型来源:https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB + + 效果示例:iPhone 7, ARM 单线程 6.3206ms + +
+ +2. 图像分类 + + 模型来源:https://github.com/forresti/SqueezeNet + + 效果示例:iPhone 7, ARM 单线程 13.83ms + +
+ +## 二、Android Demo 介绍 + +### 运行环境要求 + +1. Android Studio 3.5 或以上 +2. NDK version >= 18, <= 21 +NDK 22和23在链接第三方动态库可能会出错,例如opencv,hiai,不建议使用。 + +### 运行步骤 + +1. 下载Demo模型 + + ``` + cd /model + sh download_model.sh + ``` + + 可选:如果需要执行OCR demo,还需要下载opencv库。 + ``` + cd /scripts + sh download_opencv.sh android + ``` + + PS: + + 如因网络问题脚本无法下载模型,请根据脚本中的信息手动创建对应文件夹并自行下载. + + 想要使用NPU运行demo需要需首先下载NPU ddk。详情参考: [FAQ](../faq.md): 创建华为NPU编译环境。 + +2. 打开TNNExamples工程 + + - 进入目录`/examples/android/`,双击打开TNNExamples工程文件`build.gradle`。 + + - 将手机连接到电脑,点击`Run Demo`编译和运行demo。 + + - 工程默认编译64位armv8库,如要添加32位armv7库,可在`build.gradle`中修改为`abiFilters "armeabi-v7a", "arm64-v8a"`。 + + PS : + + 1). 想要使用NPU, 打开工程后,需要手动设置打开NPU: + 在/examples/android/demo/CMakeList.txt中, 更新指令为如下,使用华为NPU。 + ```` + set(TNN_HUAWEI_NPU_ENABLE ON CACHE BOOL "" FORCE) + ```` + 2). 第一次运行如果遇到 `/examples/android/src/main/jni/thirdparty/hiai_ddk/include/graph`Permission Denied 的情况, + Clean Project 再重新运行。 + + 3). 当前只有rom版本 >= 100.320.xxx.xxxx的华为机型支持IR构建事例模型。参考:[FAQ](../faq.md): 更新到最新的ROM支持NPU。 + + 4). 运行demo需要需首先下载NPU DDK。参考: [FAQ](../faq.md): 创建华为NPU编译环境。 + + 5). 想要执行OCR demo, 打开工程后,需要手动设置打开OPENCV依赖: + 在/examples/android/demo/CMakeList.txt中, 更新指令为如下,使用OPENCV。 + ```` + set(TNN_OPENCV_ENABLE ON CACHE BOOL "" FORCE) + ```` + + 如果通过上述`download_opencv.sh`下载OpenCV库,不需要再指定路径。 + 如果想要使用自定义的OpenCV Android SDK,需要指定OPENCV_ANDROID_SDK_PATH路径。 + 在/examples/android/demo/CMakeList.txt中, 更新指令为如下。 + ```` + set(OPENCV_ANDROID_SDK_PATH ) + ```` + + +### 运行效果 +1. 人脸检测-图片 + + 模型来源:https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB + + 效果示例:华为P30, ARM 单线程 32.2359ms + +
+ + 效果示例: 华为P30, 华为NPU rom 100.320.010.022 9.04ms + +
+ + +2. 人脸检测-视频 + 模型来源:https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB + + 效果示例:华为P30, ARM 单线程 122.296ms + +
+ + 效果示例: 华为P30, 华为NPU rom 100.320.010.022 28ms + +
+ +3. 图像分类 + + 模型来源:https://github.com/forresti/SqueezeNet + + 效果示例:华为P30, ARM 单线程 81.4047ms + +
+ + 效果示例: 华为P30, NPU rom 100.320.010.022 2.48ms + +
+ +## 三、Linux/Mac/Windows/ArmLinux/CudaLinux Demo 介绍 +### 功能 +* 快速在 Linux/Mac/Windows/ArmLinux/CudaLinux 环境下运行模型,展示 TNN 接口的使用方法。 + +### 使用步骤 +#### 1. 下载 Demo 模型 + ``` + cd /model + sh download_model.sh + ``` + 如因网络问题脚本无法下载模型,请根据脚本中的信息手动创建对应文件夹并自行下载 + +#### 2. 编译 +##### Linux +* 环境要求 + - Cmake (>=3.11) + - OpenCV3 (只有webcam的demo会用), 可在CMake中通过find_package(OpenCV 3) 成功找到依赖项。 + + ``` + // 手动编译OpenCV3 + wget https://github.com/opencv/opencv/archive/3.4.13.zip + unzip 3.4.13.zip + cd opencv-3.4.13 + + mkdir build + mkdir install + cd build + + cmake -DCMAKE_INSTALL_PREFIX=../install .. + make -j4 + make install + + // 在CMakeList.txt的find_packpage之前添加OpenCV路径 + // 例如,进入examples/linux/x86,打开CMakeList.txt + // 在find_package(OpenCV 3 REQUIRED)之前添加 + set(OpenCV_DIR /opencv-3.4.13/install/share/OpenCV) + ``` + +* 编译 + 进入 `examples/linux/x86` 目录,执行 `build_linux_native.sh`或`build_linux_openvino.sh`。前者使用TNN实现的优化X86后端执行,后者基于Intel OpenVINO后端执行。以`build_linux_native.sh`为例,默认仅编译处理图像的demo,如需编译基于摄像头的人脸配准demo,需要将`build_linux_native.sh`中的"-DTNN_DEMO_WITH_WEBCAM=OFF"修改为"-DTNN_DEMO_WITH_WEBCAM=ON": + ``` + cd /examples/linux/x86 + ./build_linux_native.sh + ``` +* 执行 + 进入 `examples/linux/x86/build_linux_native` 或 `examples/linux/x86/build_linux_openvino` 目录,当不使用任何参数执行demo文件时,会打印demo用法信息,以图形分类demo为例: + ``` + cd build_linux_native + ./demo_x86_imageclassify + >Parameter -m and -p should be set + >usage: + >./demo_x86_imageclassify [-h] [-p] tnnproto [-m] tnnmodel [-i] + > -h, print a usage message. + > -p, (required) tnn proto file path + > -m, (required) tnn model file path + > -i, (required) input file path + > -l,